|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.619987269255251, |
|
"eval_steps": 50, |
|
"global_step": 5200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06365372374283895, |
|
"grad_norm": 8.618297576904297, |
|
"learning_rate": 1.2738853503184715e-06, |
|
"loss": 1.5929, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06365372374283895, |
|
"eval_loss": 0.8718269467353821, |
|
"eval_runtime": 1.9665, |
|
"eval_samples_per_second": 79.328, |
|
"eval_steps_per_second": 4.068, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1273074474856779, |
|
"grad_norm": 13.18759822845459, |
|
"learning_rate": 2.547770700636943e-06, |
|
"loss": 0.7086, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1273074474856779, |
|
"eval_loss": 0.7185815572738647, |
|
"eval_runtime": 1.9537, |
|
"eval_samples_per_second": 79.847, |
|
"eval_steps_per_second": 4.095, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19096117122851686, |
|
"grad_norm": 7.225034236907959, |
|
"learning_rate": 3.821656050955415e-06, |
|
"loss": 0.6164, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.19096117122851686, |
|
"eval_loss": 0.7371894717216492, |
|
"eval_runtime": 1.9509, |
|
"eval_samples_per_second": 79.963, |
|
"eval_steps_per_second": 4.101, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2546148949713558, |
|
"grad_norm": 3.914799213409424, |
|
"learning_rate": 5.095541401273886e-06, |
|
"loss": 0.6015, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2546148949713558, |
|
"eval_loss": 0.6896682381629944, |
|
"eval_runtime": 1.9812, |
|
"eval_samples_per_second": 78.74, |
|
"eval_steps_per_second": 4.038, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.31826861871419476, |
|
"grad_norm": 4.877476215362549, |
|
"learning_rate": 6.369426751592357e-06, |
|
"loss": 0.6245, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.31826861871419476, |
|
"eval_loss": 0.6881433725357056, |
|
"eval_runtime": 1.9896, |
|
"eval_samples_per_second": 78.407, |
|
"eval_steps_per_second": 4.021, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3819223424570337, |
|
"grad_norm": 4.47362756729126, |
|
"learning_rate": 7.64331210191083e-06, |
|
"loss": 0.5852, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3819223424570337, |
|
"eval_loss": 0.6920483708381653, |
|
"eval_runtime": 1.9486, |
|
"eval_samples_per_second": 80.059, |
|
"eval_steps_per_second": 4.106, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.44557606619987267, |
|
"grad_norm": 3.484682559967041, |
|
"learning_rate": 8.9171974522293e-06, |
|
"loss": 0.6307, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.44557606619987267, |
|
"eval_loss": 0.6990864276885986, |
|
"eval_runtime": 1.9831, |
|
"eval_samples_per_second": 78.663, |
|
"eval_steps_per_second": 4.034, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5092297899427116, |
|
"grad_norm": 3.779184103012085, |
|
"learning_rate": 1.0191082802547772e-05, |
|
"loss": 0.6639, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5092297899427116, |
|
"eval_loss": 0.7543805241584778, |
|
"eval_runtime": 1.952, |
|
"eval_samples_per_second": 79.916, |
|
"eval_steps_per_second": 4.098, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5728835136855506, |
|
"grad_norm": 2.8436331748962402, |
|
"learning_rate": 1.1464968152866242e-05, |
|
"loss": 0.6431, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5728835136855506, |
|
"eval_loss": 0.7171684503555298, |
|
"eval_runtime": 1.993, |
|
"eval_samples_per_second": 78.274, |
|
"eval_steps_per_second": 4.014, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6365372374283895, |
|
"grad_norm": 3.409532070159912, |
|
"learning_rate": 1.2738853503184714e-05, |
|
"loss": 0.6183, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6365372374283895, |
|
"eval_loss": 0.7295389175415039, |
|
"eval_runtime": 1.9547, |
|
"eval_samples_per_second": 79.809, |
|
"eval_steps_per_second": 4.093, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7001909611712285, |
|
"grad_norm": 2.2884674072265625, |
|
"learning_rate": 1.4012738853503186e-05, |
|
"loss": 0.6344, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7001909611712285, |
|
"eval_loss": 0.7278215885162354, |
|
"eval_runtime": 1.9597, |
|
"eval_samples_per_second": 79.605, |
|
"eval_steps_per_second": 4.082, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7638446849140674, |
|
"grad_norm": 2.662555694580078, |
|
"learning_rate": 1.528662420382166e-05, |
|
"loss": 0.6635, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7638446849140674, |
|
"eval_loss": 0.7314972281455994, |
|
"eval_runtime": 1.9527, |
|
"eval_samples_per_second": 79.889, |
|
"eval_steps_per_second": 4.097, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8274984086569064, |
|
"grad_norm": 2.6137986183166504, |
|
"learning_rate": 1.6560509554140128e-05, |
|
"loss": 0.7083, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8274984086569064, |
|
"eval_loss": 0.7786588668823242, |
|
"eval_runtime": 1.9552, |
|
"eval_samples_per_second": 79.787, |
|
"eval_steps_per_second": 4.092, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8911521323997453, |
|
"grad_norm": 3.339657783508301, |
|
"learning_rate": 1.78343949044586e-05, |
|
"loss": 0.6841, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8911521323997453, |
|
"eval_loss": 0.752537727355957, |
|
"eval_runtime": 1.9503, |
|
"eval_samples_per_second": 79.986, |
|
"eval_steps_per_second": 4.102, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.9548058561425843, |
|
"grad_norm": 2.102302312850952, |
|
"learning_rate": 1.910828025477707e-05, |
|
"loss": 0.6622, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9548058561425843, |
|
"eval_loss": 0.7738191485404968, |
|
"eval_runtime": 1.9545, |
|
"eval_samples_per_second": 79.818, |
|
"eval_steps_per_second": 4.093, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.0184595798854232, |
|
"grad_norm": 4.052014350891113, |
|
"learning_rate": 1.999977755311665e-05, |
|
"loss": 0.6277, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.0184595798854232, |
|
"eval_loss": 0.7977485656738281, |
|
"eval_runtime": 1.9587, |
|
"eval_samples_per_second": 79.643, |
|
"eval_steps_per_second": 4.084, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.0821133036282622, |
|
"grad_norm": 2.9770467281341553, |
|
"learning_rate": 1.99958232171617e-05, |
|
"loss": 0.4164, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0821133036282622, |
|
"eval_loss": 0.8168254494667053, |
|
"eval_runtime": 1.9783, |
|
"eval_samples_per_second": 78.855, |
|
"eval_steps_per_second": 4.044, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.1457670273711011, |
|
"grad_norm": 2.4140615463256836, |
|
"learning_rate": 1.9986927867052226e-05, |
|
"loss": 0.3963, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1457670273711011, |
|
"eval_loss": 0.8188236355781555, |
|
"eval_runtime": 1.9525, |
|
"eval_samples_per_second": 79.898, |
|
"eval_steps_per_second": 4.097, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.20942075111394, |
|
"grad_norm": 2.735466718673706, |
|
"learning_rate": 1.997309589982985e-05, |
|
"loss": 0.4199, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.20942075111394, |
|
"eval_loss": 0.812458336353302, |
|
"eval_runtime": 1.9598, |
|
"eval_samples_per_second": 79.601, |
|
"eval_steps_per_second": 4.082, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.273074474856779, |
|
"grad_norm": 2.7539219856262207, |
|
"learning_rate": 1.995433415274493e-05, |
|
"loss": 0.4159, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.273074474856779, |
|
"eval_loss": 0.80951327085495, |
|
"eval_runtime": 1.9753, |
|
"eval_samples_per_second": 78.975, |
|
"eval_steps_per_second": 4.05, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.336728198599618, |
|
"grad_norm": 2.275360584259033, |
|
"learning_rate": 1.993065189987684e-05, |
|
"loss": 0.4107, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.336728198599618, |
|
"eval_loss": 0.8287139534950256, |
|
"eval_runtime": 1.9574, |
|
"eval_samples_per_second": 79.696, |
|
"eval_steps_per_second": 4.087, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.400381922342457, |
|
"grad_norm": 2.8758232593536377, |
|
"learning_rate": 1.9902060847549716e-05, |
|
"loss": 0.434, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.400381922342457, |
|
"eval_loss": 0.8138644695281982, |
|
"eval_runtime": 1.9536, |
|
"eval_samples_per_second": 79.852, |
|
"eval_steps_per_second": 4.095, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.464035646085296, |
|
"grad_norm": 2.412508964538574, |
|
"learning_rate": 1.986857512854594e-05, |
|
"loss": 0.4443, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.464035646085296, |
|
"eval_loss": 0.8119045495986938, |
|
"eval_runtime": 1.9894, |
|
"eval_samples_per_second": 78.417, |
|
"eval_steps_per_second": 4.021, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.5276893698281349, |
|
"grad_norm": 1.9511016607284546, |
|
"learning_rate": 1.983021129512019e-05, |
|
"loss": 0.4203, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.5276893698281349, |
|
"eval_loss": 0.8141614198684692, |
|
"eval_runtime": 1.9602, |
|
"eval_samples_per_second": 79.584, |
|
"eval_steps_per_second": 4.081, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.5913430935709738, |
|
"grad_norm": 2.6505215167999268, |
|
"learning_rate": 1.9786988310817523e-05, |
|
"loss": 0.4385, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.5913430935709738, |
|
"eval_loss": 0.8097578883171082, |
|
"eval_runtime": 1.9831, |
|
"eval_samples_per_second": 78.665, |
|
"eval_steps_per_second": 4.034, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.6549968173138128, |
|
"grad_norm": 2.4782555103302, |
|
"learning_rate": 1.973892754109955e-05, |
|
"loss": 0.4625, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.6549968173138128, |
|
"eval_loss": 0.8060766458511353, |
|
"eval_runtime": 1.953, |
|
"eval_samples_per_second": 79.877, |
|
"eval_steps_per_second": 4.096, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.7186505410566517, |
|
"grad_norm": 2.4227771759033203, |
|
"learning_rate": 1.9686052742783324e-05, |
|
"loss": 0.4417, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.7186505410566517, |
|
"eval_loss": 0.8126944303512573, |
|
"eval_runtime": 2.1634, |
|
"eval_samples_per_second": 72.108, |
|
"eval_steps_per_second": 3.698, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.7823042647994907, |
|
"grad_norm": 2.3015785217285156, |
|
"learning_rate": 1.9628390052298155e-05, |
|
"loss": 0.4483, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.7823042647994907, |
|
"eval_loss": 0.8172005414962769, |
|
"eval_runtime": 1.9944, |
|
"eval_samples_per_second": 78.219, |
|
"eval_steps_per_second": 4.011, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.8459579885423296, |
|
"grad_norm": 2.7847931385040283, |
|
"learning_rate": 1.9565967972766164e-05, |
|
"loss": 0.4551, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.8459579885423296, |
|
"eval_loss": 0.8149409890174866, |
|
"eval_runtime": 1.9515, |
|
"eval_samples_per_second": 79.937, |
|
"eval_steps_per_second": 4.099, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.9096117122851686, |
|
"grad_norm": 2.573996067047119, |
|
"learning_rate": 1.9498817359912973e-05, |
|
"loss": 0.4526, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.9096117122851686, |
|
"eval_loss": 0.8210792541503906, |
|
"eval_runtime": 1.957, |
|
"eval_samples_per_second": 79.715, |
|
"eval_steps_per_second": 4.088, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.9732654360280075, |
|
"grad_norm": 2.4847145080566406, |
|
"learning_rate": 1.9426971406815464e-05, |
|
"loss": 0.436, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.9732654360280075, |
|
"eval_loss": 0.8161709904670715, |
|
"eval_runtime": 1.9565, |
|
"eval_samples_per_second": 79.732, |
|
"eval_steps_per_second": 4.089, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.0369191597708465, |
|
"grad_norm": 2.272801637649536, |
|
"learning_rate": 1.9350465627494196e-05, |
|
"loss": 0.3299, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.0369191597708465, |
|
"eval_loss": 0.8792819380760193, |
|
"eval_runtime": 1.9865, |
|
"eval_samples_per_second": 78.531, |
|
"eval_steps_per_second": 4.027, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.1005728835136854, |
|
"grad_norm": 2.545703172683716, |
|
"learning_rate": 1.926933783935853e-05, |
|
"loss": 0.2247, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.1005728835136854, |
|
"eval_loss": 0.89618319272995, |
|
"eval_runtime": 1.9719, |
|
"eval_samples_per_second": 79.113, |
|
"eval_steps_per_second": 4.057, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.1642266072565244, |
|
"grad_norm": 1.6922868490219116, |
|
"learning_rate": 1.9183628144513196e-05, |
|
"loss": 0.2404, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.1642266072565244, |
|
"eval_loss": 0.8865496516227722, |
|
"eval_runtime": 1.9548, |
|
"eval_samples_per_second": 79.805, |
|
"eval_steps_per_second": 4.093, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.2278803309993633, |
|
"grad_norm": 1.7290230989456177, |
|
"learning_rate": 1.9093378909935503e-05, |
|
"loss": 0.226, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.2278803309993633, |
|
"eval_loss": 0.9072224497795105, |
|
"eval_runtime": 1.979, |
|
"eval_samples_per_second": 78.828, |
|
"eval_steps_per_second": 4.042, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.2915340547422023, |
|
"grad_norm": 1.6177765130996704, |
|
"learning_rate": 1.8998634746533004e-05, |
|
"loss": 0.2395, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.2915340547422023, |
|
"eval_loss": 0.9168246388435364, |
|
"eval_runtime": 1.956, |
|
"eval_samples_per_second": 79.753, |
|
"eval_steps_per_second": 4.09, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.3551877784850412, |
|
"grad_norm": 2.4058384895324707, |
|
"learning_rate": 1.889944248709198e-05, |
|
"loss": 0.2425, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.3551877784850412, |
|
"eval_loss": 0.9104910492897034, |
|
"eval_runtime": 1.9546, |
|
"eval_samples_per_second": 79.811, |
|
"eval_steps_per_second": 4.093, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.41884150222788, |
|
"grad_norm": 1.7686213254928589, |
|
"learning_rate": 1.8795851163127626e-05, |
|
"loss": 0.2456, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.41884150222788, |
|
"eval_loss": 0.9102558493614197, |
|
"eval_runtime": 2.0139, |
|
"eval_samples_per_second": 77.463, |
|
"eval_steps_per_second": 3.972, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.482495225970719, |
|
"grad_norm": 2.483212947845459, |
|
"learning_rate": 1.8687911980647375e-05, |
|
"loss": 0.241, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.482495225970719, |
|
"eval_loss": 0.930452287197113, |
|
"eval_runtime": 1.964, |
|
"eval_samples_per_second": 79.428, |
|
"eval_steps_per_second": 4.073, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.546148949713558, |
|
"grad_norm": 2.0306715965270996, |
|
"learning_rate": 1.857567829483937e-05, |
|
"loss": 0.2529, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.546148949713558, |
|
"eval_loss": 0.9052493572235107, |
|
"eval_runtime": 1.9617, |
|
"eval_samples_per_second": 79.522, |
|
"eval_steps_per_second": 4.078, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.609802673456397, |
|
"grad_norm": 2.4319934844970703, |
|
"learning_rate": 1.8459205583698598e-05, |
|
"loss": 0.2479, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.609802673456397, |
|
"eval_loss": 0.909498929977417, |
|
"eval_runtime": 1.9526, |
|
"eval_samples_per_second": 79.894, |
|
"eval_steps_per_second": 4.097, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.673456397199236, |
|
"grad_norm": 1.3272260427474976, |
|
"learning_rate": 1.833855142060364e-05, |
|
"loss": 0.2411, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.673456397199236, |
|
"eval_loss": 0.9212460517883301, |
|
"eval_runtime": 1.9521, |
|
"eval_samples_per_second": 79.913, |
|
"eval_steps_per_second": 4.098, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.737110120942075, |
|
"grad_norm": 1.639654278755188, |
|
"learning_rate": 1.8213775445857716e-05, |
|
"loss": 0.2392, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.737110120942075, |
|
"eval_loss": 0.911665678024292, |
|
"eval_runtime": 1.9821, |
|
"eval_samples_per_second": 78.704, |
|
"eval_steps_per_second": 4.036, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.800763844684914, |
|
"grad_norm": 1.5815492868423462, |
|
"learning_rate": 1.808493933720802e-05, |
|
"loss": 0.2568, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.800763844684914, |
|
"eval_loss": 0.910538375377655, |
|
"eval_runtime": 1.9947, |
|
"eval_samples_per_second": 78.207, |
|
"eval_steps_per_second": 4.011, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.864417568427753, |
|
"grad_norm": 1.519374132156372, |
|
"learning_rate": 1.7952106779357922e-05, |
|
"loss": 0.2459, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.864417568427753, |
|
"eval_loss": 0.9174743890762329, |
|
"eval_runtime": 1.9547, |
|
"eval_samples_per_second": 79.81, |
|
"eval_steps_per_second": 4.093, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.928071292170592, |
|
"grad_norm": 2.186408042907715, |
|
"learning_rate": 1.7815343432487094e-05, |
|
"loss": 0.2521, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.928071292170592, |
|
"eval_loss": 0.9191064238548279, |
|
"eval_runtime": 1.972, |
|
"eval_samples_per_second": 79.107, |
|
"eval_steps_per_second": 4.057, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.991725015913431, |
|
"grad_norm": 1.9706707000732422, |
|
"learning_rate": 1.7674716899795205e-05, |
|
"loss": 0.2462, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.991725015913431, |
|
"eval_loss": 0.9158605933189392, |
|
"eval_runtime": 1.9566, |
|
"eval_samples_per_second": 79.73, |
|
"eval_steps_per_second": 4.089, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.0553787396562697, |
|
"grad_norm": 2.1943814754486084, |
|
"learning_rate": 1.753029669408509e-05, |
|
"loss": 0.1712, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.0553787396562697, |
|
"eval_loss": 0.9773328900337219, |
|
"eval_runtime": 1.9608, |
|
"eval_samples_per_second": 79.559, |
|
"eval_steps_per_second": 4.08, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.1190324633991087, |
|
"grad_norm": 1.6751331090927124, |
|
"learning_rate": 1.738215420340205e-05, |
|
"loss": 0.1609, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.1190324633991087, |
|
"eval_loss": 0.9757582545280457, |
|
"eval_runtime": 1.9922, |
|
"eval_samples_per_second": 78.306, |
|
"eval_steps_per_second": 4.016, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.1826861871419476, |
|
"grad_norm": 1.4819265604019165, |
|
"learning_rate": 1.723036265574616e-05, |
|
"loss": 0.1684, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.1826861871419476, |
|
"eval_loss": 0.9819521903991699, |
|
"eval_runtime": 1.9611, |
|
"eval_samples_per_second": 79.546, |
|
"eval_steps_per_second": 4.079, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.2463399108847866, |
|
"grad_norm": 1.6287983655929565, |
|
"learning_rate": 1.7074997082875113e-05, |
|
"loss": 0.1677, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.2463399108847866, |
|
"eval_loss": 0.9783702492713928, |
|
"eval_runtime": 1.9577, |
|
"eval_samples_per_second": 79.684, |
|
"eval_steps_per_second": 4.086, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.3099936346276255, |
|
"grad_norm": 0.7324376106262207, |
|
"learning_rate": 1.6916134283215412e-05, |
|
"loss": 0.1705, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.3099936346276255, |
|
"eval_loss": 0.9991599917411804, |
|
"eval_runtime": 1.9593, |
|
"eval_samples_per_second": 79.622, |
|
"eval_steps_per_second": 4.083, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.373647358370465, |
|
"grad_norm": 2.1236824989318848, |
|
"learning_rate": 1.6753852783900306e-05, |
|
"loss": 0.1679, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.373647358370465, |
|
"eval_loss": 0.9751440286636353, |
|
"eval_runtime": 1.9544, |
|
"eval_samples_per_second": 79.821, |
|
"eval_steps_per_second": 4.093, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.4373010821133034, |
|
"grad_norm": 1.1708719730377197, |
|
"learning_rate": 1.6588232801953223e-05, |
|
"loss": 0.1668, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.4373010821133034, |
|
"eval_loss": 0.9651870727539062, |
|
"eval_runtime": 1.9795, |
|
"eval_samples_per_second": 78.809, |
|
"eval_steps_per_second": 4.041, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.500954805856143, |
|
"grad_norm": 1.5355838537216187, |
|
"learning_rate": 1.641935620463584e-05, |
|
"loss": 0.1673, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.500954805856143, |
|
"eval_loss": 0.9585145115852356, |
|
"eval_runtime": 1.9555, |
|
"eval_samples_per_second": 79.775, |
|
"eval_steps_per_second": 4.091, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.5646085295989813, |
|
"grad_norm": 1.21344792842865, |
|
"learning_rate": 1.6247306468980475e-05, |
|
"loss": 0.1691, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.5646085295989813, |
|
"eval_loss": 0.9902192950248718, |
|
"eval_runtime": 1.9655, |
|
"eval_samples_per_second": 79.368, |
|
"eval_steps_per_second": 4.07, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.6282622533418207, |
|
"grad_norm": 1.3404573202133179, |
|
"learning_rate": 1.607216864052672e-05, |
|
"loss": 0.1707, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.6282622533418207, |
|
"eval_loss": 0.9820557832717896, |
|
"eval_runtime": 1.957, |
|
"eval_samples_per_second": 79.716, |
|
"eval_steps_per_second": 4.088, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.6919159770846592, |
|
"grad_norm": 1.7005574703216553, |
|
"learning_rate": 1.589402929128276e-05, |
|
"loss": 0.1694, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.6919159770846592, |
|
"eval_loss": 0.9860473871231079, |
|
"eval_runtime": 1.9577, |
|
"eval_samples_per_second": 79.685, |
|
"eval_steps_per_second": 4.086, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.7555697008274986, |
|
"grad_norm": 3.1085927486419678, |
|
"learning_rate": 1.571297647693215e-05, |
|
"loss": 0.1693, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.7555697008274986, |
|
"eval_loss": 0.9597901701927185, |
|
"eval_runtime": 1.9583, |
|
"eval_samples_per_second": 79.66, |
|
"eval_steps_per_second": 4.085, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.819223424570337, |
|
"grad_norm": 1.1406304836273193, |
|
"learning_rate": 1.5529099693307205e-05, |
|
"loss": 0.1708, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.819223424570337, |
|
"eval_loss": 0.9565241932868958, |
|
"eval_runtime": 1.9592, |
|
"eval_samples_per_second": 79.623, |
|
"eval_steps_per_second": 4.083, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.8828771483131765, |
|
"grad_norm": 1.1178061962127686, |
|
"learning_rate": 1.534248983215048e-05, |
|
"loss": 0.1671, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 3.8828771483131765, |
|
"eval_loss": 0.9801331758499146, |
|
"eval_runtime": 1.9614, |
|
"eval_samples_per_second": 79.536, |
|
"eval_steps_per_second": 4.079, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 3.946530872056015, |
|
"grad_norm": 1.7261744737625122, |
|
"learning_rate": 1.5153239136186297e-05, |
|
"loss": 0.1603, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.946530872056015, |
|
"eval_loss": 0.9833694100379944, |
|
"eval_runtime": 1.9582, |
|
"eval_samples_per_second": 79.664, |
|
"eval_steps_per_second": 4.085, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.0101845957988544, |
|
"grad_norm": 1.4014726877212524, |
|
"learning_rate": 1.4961441153524421e-05, |
|
"loss": 0.1604, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.0101845957988544, |
|
"eval_loss": 1.0256284475326538, |
|
"eval_runtime": 1.9881, |
|
"eval_samples_per_second": 78.467, |
|
"eval_steps_per_second": 4.024, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.073838319541693, |
|
"grad_norm": 0.9033142924308777, |
|
"learning_rate": 1.47671906914185e-05, |
|
"loss": 0.1046, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.073838319541693, |
|
"eval_loss": 1.0567926168441772, |
|
"eval_runtime": 1.9551, |
|
"eval_samples_per_second": 79.79, |
|
"eval_steps_per_second": 4.092, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.137492043284532, |
|
"grad_norm": 0.9893561005592346, |
|
"learning_rate": 1.4570583769402085e-05, |
|
"loss": 0.1069, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.137492043284532, |
|
"eval_loss": 1.0297176837921143, |
|
"eval_runtime": 1.9549, |
|
"eval_samples_per_second": 79.801, |
|
"eval_steps_per_second": 4.092, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.201145767027371, |
|
"grad_norm": 1.3346303701400757, |
|
"learning_rate": 1.437171757182542e-05, |
|
"loss": 0.1032, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.201145767027371, |
|
"eval_loss": 1.0205018520355225, |
|
"eval_runtime": 1.9586, |
|
"eval_samples_per_second": 79.648, |
|
"eval_steps_per_second": 4.084, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.26479949077021, |
|
"grad_norm": 0.8845686316490173, |
|
"learning_rate": 1.4170690399816469e-05, |
|
"loss": 0.1038, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 4.26479949077021, |
|
"eval_loss": 1.027697205543518, |
|
"eval_runtime": 1.9567, |
|
"eval_samples_per_second": 79.727, |
|
"eval_steps_per_second": 4.089, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 4.328453214513049, |
|
"grad_norm": 0.8891485333442688, |
|
"learning_rate": 1.3967601622689864e-05, |
|
"loss": 0.1122, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 4.328453214513049, |
|
"eval_loss": 1.0382416248321533, |
|
"eval_runtime": 1.9585, |
|
"eval_samples_per_second": 79.654, |
|
"eval_steps_per_second": 4.085, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 4.392106938255888, |
|
"grad_norm": 1.3972492218017578, |
|
"learning_rate": 1.3762551628827892e-05, |
|
"loss": 0.1086, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 4.392106938255888, |
|
"eval_loss": 1.0479986667633057, |
|
"eval_runtime": 1.9854, |
|
"eval_samples_per_second": 78.575, |
|
"eval_steps_per_second": 4.03, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 4.455760661998727, |
|
"grad_norm": 0.9529390335083008, |
|
"learning_rate": 1.3555641776057729e-05, |
|
"loss": 0.1086, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 4.455760661998727, |
|
"eval_loss": 1.026903748512268, |
|
"eval_runtime": 1.9801, |
|
"eval_samples_per_second": 78.783, |
|
"eval_steps_per_second": 4.04, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 4.519414385741566, |
|
"grad_norm": 0.9916590452194214, |
|
"learning_rate": 1.3346974341549448e-05, |
|
"loss": 0.1082, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 4.519414385741566, |
|
"eval_loss": 1.0392361879348755, |
|
"eval_runtime": 1.9611, |
|
"eval_samples_per_second": 79.546, |
|
"eval_steps_per_second": 4.079, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 4.583068109484405, |
|
"grad_norm": 1.4986087083816528, |
|
"learning_rate": 1.3136652471259624e-05, |
|
"loss": 0.1067, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 4.583068109484405, |
|
"eval_loss": 1.014196753501892, |
|
"eval_runtime": 1.9787, |
|
"eval_samples_per_second": 78.839, |
|
"eval_steps_per_second": 4.043, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 4.646721833227244, |
|
"grad_norm": 1.2164666652679443, |
|
"learning_rate": 1.2924780128945473e-05, |
|
"loss": 0.1124, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 4.646721833227244, |
|
"eval_loss": 1.0109295845031738, |
|
"eval_runtime": 1.9575, |
|
"eval_samples_per_second": 79.695, |
|
"eval_steps_per_second": 4.087, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 4.7103755569700825, |
|
"grad_norm": 0.980490505695343, |
|
"learning_rate": 1.2711462044774747e-05, |
|
"loss": 0.1079, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 4.7103755569700825, |
|
"eval_loss": 1.025646686553955, |
|
"eval_runtime": 1.9514, |
|
"eval_samples_per_second": 79.941, |
|
"eval_steps_per_second": 4.1, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 4.774029280712922, |
|
"grad_norm": 1.684493064880371, |
|
"learning_rate": 1.249680366355678e-05, |
|
"loss": 0.1106, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 4.774029280712922, |
|
"eval_loss": 1.0292022228240967, |
|
"eval_runtime": 1.9487, |
|
"eval_samples_per_second": 80.051, |
|
"eval_steps_per_second": 4.105, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 4.83768300445576, |
|
"grad_norm": 0.6794301867485046, |
|
"learning_rate": 1.2280911092620298e-05, |
|
"loss": 0.108, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 4.83768300445576, |
|
"eval_loss": 1.012285828590393, |
|
"eval_runtime": 1.9548, |
|
"eval_samples_per_second": 79.806, |
|
"eval_steps_per_second": 4.093, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 4.9013367281986, |
|
"grad_norm": 1.613420009613037, |
|
"learning_rate": 1.2063891049363725e-05, |
|
"loss": 0.1078, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 4.9013367281986, |
|
"eval_loss": 1.029428482055664, |
|
"eval_runtime": 1.9459, |
|
"eval_samples_per_second": 80.167, |
|
"eval_steps_per_second": 4.111, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 4.964990451941438, |
|
"grad_norm": 0.8637195825576782, |
|
"learning_rate": 1.1845850808503939e-05, |
|
"loss": 0.1012, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 4.964990451941438, |
|
"eval_loss": 1.0551037788391113, |
|
"eval_runtime": 1.9584, |
|
"eval_samples_per_second": 79.656, |
|
"eval_steps_per_second": 4.085, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 5.028644175684278, |
|
"grad_norm": 1.0889546871185303, |
|
"learning_rate": 1.1626898149049523e-05, |
|
"loss": 0.0913, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 5.028644175684278, |
|
"eval_loss": 1.0936988592147827, |
|
"eval_runtime": 1.9524, |
|
"eval_samples_per_second": 79.901, |
|
"eval_steps_per_second": 4.097, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 5.092297899427116, |
|
"grad_norm": 1.0244091749191284, |
|
"learning_rate": 1.1407141301024762e-05, |
|
"loss": 0.0699, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.092297899427116, |
|
"eval_loss": 1.0808786153793335, |
|
"eval_runtime": 1.9799, |
|
"eval_samples_per_second": 78.793, |
|
"eval_steps_per_second": 4.041, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.155951623169956, |
|
"grad_norm": 0.9700921773910522, |
|
"learning_rate": 1.1186688891970686e-05, |
|
"loss": 0.0685, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 5.155951623169956, |
|
"eval_loss": 1.0664215087890625, |
|
"eval_runtime": 1.9478, |
|
"eval_samples_per_second": 80.091, |
|
"eval_steps_per_second": 4.107, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 5.219605346912794, |
|
"grad_norm": 0.8142929077148438, |
|
"learning_rate": 1.0965649893249619e-05, |
|
"loss": 0.0667, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 5.219605346912794, |
|
"eval_loss": 1.0777896642684937, |
|
"eval_runtime": 1.9828, |
|
"eval_samples_per_second": 78.676, |
|
"eval_steps_per_second": 4.035, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 5.2832590706556335, |
|
"grad_norm": 0.47455140948295593, |
|
"learning_rate": 1.074413356617978e-05, |
|
"loss": 0.0689, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 5.2832590706556335, |
|
"eval_loss": 1.0918174982070923, |
|
"eval_runtime": 1.9937, |
|
"eval_samples_per_second": 78.247, |
|
"eval_steps_per_second": 4.013, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 5.346912794398472, |
|
"grad_norm": 0.5227736234664917, |
|
"learning_rate": 1.0522249408026553e-05, |
|
"loss": 0.0662, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 5.346912794398472, |
|
"eval_loss": 1.0910652875900269, |
|
"eval_runtime": 1.9443, |
|
"eval_samples_per_second": 80.236, |
|
"eval_steps_per_second": 4.115, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 5.410566518141311, |
|
"grad_norm": 0.9572489261627197, |
|
"learning_rate": 1.0300107097877114e-05, |
|
"loss": 0.066, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 5.410566518141311, |
|
"eval_loss": 1.0842642784118652, |
|
"eval_runtime": 1.9976, |
|
"eval_samples_per_second": 78.093, |
|
"eval_steps_per_second": 4.005, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 5.47422024188415, |
|
"grad_norm": 1.3733848333358765, |
|
"learning_rate": 1.0077816442425216e-05, |
|
"loss": 0.0717, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 5.47422024188415, |
|
"eval_loss": 1.0852998495101929, |
|
"eval_runtime": 1.9568, |
|
"eval_samples_per_second": 79.722, |
|
"eval_steps_per_second": 4.088, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 5.537873965626989, |
|
"grad_norm": 0.3845078945159912, |
|
"learning_rate": 9.85548732169286e-06, |
|
"loss": 0.0668, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 5.537873965626989, |
|
"eval_loss": 1.0928053855895996, |
|
"eval_runtime": 1.9824, |
|
"eval_samples_per_second": 78.692, |
|
"eval_steps_per_second": 4.036, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 5.601527689369828, |
|
"grad_norm": 0.6611254215240479, |
|
"learning_rate": 9.633229634715734e-06, |
|
"loss": 0.0683, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 5.601527689369828, |
|
"eval_loss": 1.08696448802948, |
|
"eval_runtime": 1.948, |
|
"eval_samples_per_second": 80.082, |
|
"eval_steps_per_second": 4.107, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 5.665181413112667, |
|
"grad_norm": 0.2818908095359802, |
|
"learning_rate": 9.411153245219262e-06, |
|
"loss": 0.0672, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 5.665181413112667, |
|
"eval_loss": 1.0971331596374512, |
|
"eval_runtime": 1.9492, |
|
"eval_samples_per_second": 80.034, |
|
"eval_steps_per_second": 4.104, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 5.728835136855506, |
|
"grad_norm": 0.5571719408035278, |
|
"learning_rate": 9.18936792731209e-06, |
|
"loss": 0.0716, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 5.728835136855506, |
|
"eval_loss": 1.084871530532837, |
|
"eval_runtime": 1.9919, |
|
"eval_samples_per_second": 78.318, |
|
"eval_steps_per_second": 4.016, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 5.792488860598345, |
|
"grad_norm": 0.5682917237281799, |
|
"learning_rate": 8.967983311223898e-06, |
|
"loss": 0.0699, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 5.792488860598345, |
|
"eval_loss": 1.092268943786621, |
|
"eval_runtime": 2.0128, |
|
"eval_samples_per_second": 77.502, |
|
"eval_steps_per_second": 3.974, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 5.856142584341184, |
|
"grad_norm": 0.7702981233596802, |
|
"learning_rate": 8.747108829114284e-06, |
|
"loss": 0.0731, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 5.856142584341184, |
|
"eval_loss": 1.0890579223632812, |
|
"eval_runtime": 1.9571, |
|
"eval_samples_per_second": 79.711, |
|
"eval_steps_per_second": 4.088, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 5.919796308084023, |
|
"grad_norm": 0.8630861043930054, |
|
"learning_rate": 8.526853660979609e-06, |
|
"loss": 0.0664, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 5.919796308084023, |
|
"eval_loss": 1.0908620357513428, |
|
"eval_runtime": 1.962, |
|
"eval_samples_per_second": 79.511, |
|
"eval_steps_per_second": 4.077, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 5.9834500318268615, |
|
"grad_norm": 1.2592061758041382, |
|
"learning_rate": 8.30732668068446e-06, |
|
"loss": 0.0723, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 5.9834500318268615, |
|
"eval_loss": 1.0808950662612915, |
|
"eval_runtime": 1.9536, |
|
"eval_samples_per_second": 79.853, |
|
"eval_steps_per_second": 4.095, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 6.047103755569701, |
|
"grad_norm": 0.3191361725330353, |
|
"learning_rate": 8.088636402144442e-06, |
|
"loss": 0.0584, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 6.047103755569701, |
|
"eval_loss": 1.1153998374938965, |
|
"eval_runtime": 1.9605, |
|
"eval_samples_per_second": 79.571, |
|
"eval_steps_per_second": 4.081, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 6.110757479312539, |
|
"grad_norm": 0.5758917927742004, |
|
"learning_rate": 7.870890925686875e-06, |
|
"loss": 0.0528, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 6.110757479312539, |
|
"eval_loss": 1.1246304512023926, |
|
"eval_runtime": 1.9862, |
|
"eval_samples_per_second": 78.541, |
|
"eval_steps_per_second": 4.028, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 6.174411203055379, |
|
"grad_norm": 0.8011472821235657, |
|
"learning_rate": 7.654197884615991e-06, |
|
"loss": 0.0549, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 6.174411203055379, |
|
"eval_loss": 1.1302942037582397, |
|
"eval_runtime": 1.9621, |
|
"eval_samples_per_second": 79.507, |
|
"eval_steps_per_second": 4.077, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 6.238064926798217, |
|
"grad_norm": 0.45742979645729065, |
|
"learning_rate": 7.438664392008903e-06, |
|
"loss": 0.0548, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 6.238064926798217, |
|
"eval_loss": 1.1371703147888184, |
|
"eval_runtime": 1.9865, |
|
"eval_samples_per_second": 78.531, |
|
"eval_steps_per_second": 4.027, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 6.301718650541057, |
|
"grad_norm": 1.6437746286392212, |
|
"learning_rate": 7.224396987768785e-06, |
|
"loss": 0.0552, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 6.301718650541057, |
|
"eval_loss": 1.1244136095046997, |
|
"eval_runtime": 1.9966, |
|
"eval_samples_per_second": 78.133, |
|
"eval_steps_per_second": 4.007, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 6.365372374283895, |
|
"grad_norm": 0.2734375298023224, |
|
"learning_rate": 7.011501585961369e-06, |
|
"loss": 0.053, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 6.365372374283895, |
|
"eval_loss": 1.1277772188186646, |
|
"eval_runtime": 1.9932, |
|
"eval_samples_per_second": 78.265, |
|
"eval_steps_per_second": 4.014, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 6.429026098026735, |
|
"grad_norm": 0.3379392623901367, |
|
"learning_rate": 6.800083422460766e-06, |
|
"loss": 0.0556, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 6.429026098026735, |
|
"eval_loss": 1.1319537162780762, |
|
"eval_runtime": 1.9815, |
|
"eval_samples_per_second": 78.728, |
|
"eval_steps_per_second": 4.037, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 6.492679821769573, |
|
"grad_norm": 0.34678924083709717, |
|
"learning_rate": 6.590247002930567e-06, |
|
"loss": 0.0574, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 6.492679821769573, |
|
"eval_loss": 1.112044334411621, |
|
"eval_runtime": 1.9568, |
|
"eval_samples_per_second": 79.721, |
|
"eval_steps_per_second": 4.088, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 6.5563335455124125, |
|
"grad_norm": 0.6663870811462402, |
|
"learning_rate": 6.382096051165847e-06, |
|
"loss": 0.0536, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 6.5563335455124125, |
|
"eval_loss": 1.135048747062683, |
|
"eval_runtime": 1.955, |
|
"eval_samples_per_second": 79.795, |
|
"eval_steps_per_second": 4.092, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 6.619987269255251, |
|
"grad_norm": 0.5493341088294983, |
|
"learning_rate": 6.175733457821691e-06, |
|
"loss": 0.0548, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 6.619987269255251, |
|
"eval_loss": 1.1426846981048584, |
|
"eval_runtime": 1.9543, |
|
"eval_samples_per_second": 79.823, |
|
"eval_steps_per_second": 4.094, |
|
"step": 5200 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 7850, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 400, |
|
"total_flos": 2.0478057288984166e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|