diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4815 @@ +{ + "best_metric": 0.01230713166296482, + "best_model_checkpoint": "/home/paperspace/Data/models/centime/llm3br256/checkpoint-400", + "epoch": 4.992143658810326, + "eval_steps": 5, + "global_step": 555, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008978675645342313, + "grad_norm": 0.12415610253810883, + "learning_rate": 1.7857142857142857e-06, + "loss": 0.0585, + "step": 1 + }, + { + "epoch": 0.017957351290684626, + "grad_norm": 0.11934095621109009, + "learning_rate": 3.5714285714285714e-06, + "loss": 0.0573, + "step": 2 + }, + { + "epoch": 0.026936026936026935, + "grad_norm": 0.12277644127607346, + "learning_rate": 5.357142857142857e-06, + "loss": 0.0583, + "step": 3 + }, + { + "epoch": 0.03591470258136925, + "grad_norm": 0.11886874586343765, + "learning_rate": 7.142857142857143e-06, + "loss": 0.0575, + "step": 4 + }, + { + "epoch": 0.04489337822671156, + "grad_norm": 0.12854638695716858, + "learning_rate": 8.92857142857143e-06, + "loss": 0.0612, + "step": 5 + }, + { + "epoch": 0.04489337822671156, + "eval_loss": 0.05599946156144142, + "eval_runtime": 8.137, + "eval_samples_per_second": 6.145, + "eval_steps_per_second": 1.598, + "step": 5 + }, + { + "epoch": 0.05387205387205387, + "grad_norm": 0.09200441092252731, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.05, + "step": 6 + }, + { + "epoch": 0.06285072951739619, + "grad_norm": 0.0996675118803978, + "learning_rate": 1.25e-05, + "loss": 0.0515, + "step": 7 + }, + { + "epoch": 0.0718294051627385, + "grad_norm": 0.08751285076141357, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.0436, + "step": 8 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 0.07102574408054352, + "learning_rate": 1.6071428571428572e-05, + "loss": 0.044, + "step": 9 + }, + { + "epoch": 0.08978675645342311, + "grad_norm": 0.05904633551836014, + "learning_rate": 1.785714285714286e-05, + "loss": 0.0411, + "step": 10 + }, + { + "epoch": 0.08978675645342311, + "eval_loss": 0.035706527531147, + "eval_runtime": 6.2599, + "eval_samples_per_second": 7.987, + "eval_steps_per_second": 2.077, + "step": 10 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 0.07080914080142975, + "learning_rate": 1.9642857142857145e-05, + "loss": 0.036, + "step": 11 + }, + { + "epoch": 0.10774410774410774, + "grad_norm": 0.07890115678310394, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.0345, + "step": 12 + }, + { + "epoch": 0.11672278338945005, + "grad_norm": 0.06593428552150726, + "learning_rate": 2.3214285714285715e-05, + "loss": 0.0351, + "step": 13 + }, + { + "epoch": 0.12570145903479238, + "grad_norm": 0.04619583487510681, + "learning_rate": 2.5e-05, + "loss": 0.0338, + "step": 14 + }, + { + "epoch": 0.13468013468013468, + "grad_norm": 0.04647090658545494, + "learning_rate": 2.6785714285714288e-05, + "loss": 0.0353, + "step": 15 + }, + { + "epoch": 0.13468013468013468, + "eval_loss": 0.03013807348906994, + "eval_runtime": 6.2491, + "eval_samples_per_second": 8.001, + "eval_steps_per_second": 2.08, + "step": 15 + }, + { + "epoch": 0.143658810325477, + "grad_norm": 0.04043864831328392, + "learning_rate": 2.857142857142857e-05, + "loss": 0.0304, + "step": 16 + }, + { + "epoch": 0.1526374859708193, + "grad_norm": 0.043566230684518814, + "learning_rate": 3.0357142857142857e-05, + "loss": 0.0303, + "step": 17 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 0.042150504887104034, + "learning_rate": 3.2142857142857144e-05, + "loss": 0.0327, + "step": 18 + }, + { + "epoch": 0.17059483726150393, + "grad_norm": 0.03820006549358368, + "learning_rate": 3.392857142857143e-05, + "loss": 0.0271, + "step": 19 + }, + { + "epoch": 0.17957351290684623, + "grad_norm": 0.03358590975403786, + "learning_rate": 3.571428571428572e-05, + "loss": 0.0286, + "step": 20 + }, + { + "epoch": 0.17957351290684623, + "eval_loss": 0.026352621614933014, + "eval_runtime": 6.261, + "eval_samples_per_second": 7.986, + "eval_steps_per_second": 2.076, + "step": 20 + }, + { + "epoch": 0.18855218855218855, + "grad_norm": 0.03568700700998306, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.0293, + "step": 21 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 0.033443234860897064, + "learning_rate": 3.928571428571429e-05, + "loss": 0.027, + "step": 22 + }, + { + "epoch": 0.20650953984287318, + "grad_norm": 0.03302915766835213, + "learning_rate": 4.107142857142857e-05, + "loss": 0.027, + "step": 23 + }, + { + "epoch": 0.21548821548821548, + "grad_norm": 0.03412705287337303, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.0249, + "step": 24 + }, + { + "epoch": 0.2244668911335578, + "grad_norm": 0.03498664125800133, + "learning_rate": 4.464285714285715e-05, + "loss": 0.0282, + "step": 25 + }, + { + "epoch": 0.2244668911335578, + "eval_loss": 0.023908844217658043, + "eval_runtime": 6.2612, + "eval_samples_per_second": 7.986, + "eval_steps_per_second": 2.076, + "step": 25 + }, + { + "epoch": 0.2334455667789001, + "grad_norm": 0.02876153774559498, + "learning_rate": 4.642857142857143e-05, + "loss": 0.0249, + "step": 26 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.03279321268200874, + "learning_rate": 4.8214285714285716e-05, + "loss": 0.0254, + "step": 27 + }, + { + "epoch": 0.25140291806958476, + "grad_norm": 0.03252970054745674, + "learning_rate": 5e-05, + "loss": 0.024, + "step": 28 + }, + { + "epoch": 0.26038159371492703, + "grad_norm": 0.02586694434285164, + "learning_rate": 5.1785714285714296e-05, + "loss": 0.0251, + "step": 29 + }, + { + "epoch": 0.26936026936026936, + "grad_norm": 0.028388267382979393, + "learning_rate": 5.3571428571428575e-05, + "loss": 0.0223, + "step": 30 + }, + { + "epoch": 0.26936026936026936, + "eval_loss": 0.02239508368074894, + "eval_runtime": 6.2554, + "eval_samples_per_second": 7.993, + "eval_steps_per_second": 2.078, + "step": 30 + }, + { + "epoch": 0.2783389450056117, + "grad_norm": 0.040059663355350494, + "learning_rate": 5.535714285714286e-05, + "loss": 0.0281, + "step": 31 + }, + { + "epoch": 0.287317620650954, + "grad_norm": 0.02905772626399994, + "learning_rate": 5.714285714285714e-05, + "loss": 0.0243, + "step": 32 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.028802093118429184, + "learning_rate": 5.8928571428571435e-05, + "loss": 0.0224, + "step": 33 + }, + { + "epoch": 0.3052749719416386, + "grad_norm": 0.031860049813985825, + "learning_rate": 6.0714285714285715e-05, + "loss": 0.0246, + "step": 34 + }, + { + "epoch": 0.31425364758698093, + "grad_norm": 0.029610324651002884, + "learning_rate": 6.25e-05, + "loss": 0.0242, + "step": 35 + }, + { + "epoch": 0.31425364758698093, + "eval_loss": 0.020946728065609932, + "eval_runtime": 6.2517, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.079, + "step": 35 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 0.025372274219989777, + "learning_rate": 6.428571428571429e-05, + "loss": 0.0257, + "step": 36 + }, + { + "epoch": 0.33221099887766553, + "grad_norm": 0.030153121799230576, + "learning_rate": 6.607142857142857e-05, + "loss": 0.0221, + "step": 37 + }, + { + "epoch": 0.34118967452300786, + "grad_norm": 0.023654770106077194, + "learning_rate": 6.785714285714286e-05, + "loss": 0.0201, + "step": 38 + }, + { + "epoch": 0.3501683501683502, + "grad_norm": 0.026218950748443604, + "learning_rate": 6.964285714285715e-05, + "loss": 0.0207, + "step": 39 + }, + { + "epoch": 0.35914702581369246, + "grad_norm": 0.02605343423783779, + "learning_rate": 7.142857142857143e-05, + "loss": 0.0211, + "step": 40 + }, + { + "epoch": 0.35914702581369246, + "eval_loss": 0.020273756235837936, + "eval_runtime": 6.2552, + "eval_samples_per_second": 7.993, + "eval_steps_per_second": 2.078, + "step": 40 + }, + { + "epoch": 0.3681257014590348, + "grad_norm": 0.026552610099315643, + "learning_rate": 7.321428571428571e-05, + "loss": 0.0226, + "step": 41 + }, + { + "epoch": 0.3771043771043771, + "grad_norm": 0.020305411890149117, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0187, + "step": 42 + }, + { + "epoch": 0.38608305274971944, + "grad_norm": 0.023941006511449814, + "learning_rate": 7.67857142857143e-05, + "loss": 0.0213, + "step": 43 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 0.024746833369135857, + "learning_rate": 7.857142857142858e-05, + "loss": 0.0214, + "step": 44 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.018778987228870392, + "learning_rate": 8.035714285714287e-05, + "loss": 0.0178, + "step": 45 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.0200771763920784, + "eval_runtime": 6.2549, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 2.078, + "step": 45 + }, + { + "epoch": 0.41301907968574636, + "grad_norm": 0.02559836022555828, + "learning_rate": 8.214285714285714e-05, + "loss": 0.0186, + "step": 46 + }, + { + "epoch": 0.4219977553310887, + "grad_norm": 0.02603279985487461, + "learning_rate": 8.392857142857144e-05, + "loss": 0.0205, + "step": 47 + }, + { + "epoch": 0.43097643097643096, + "grad_norm": 0.023479154333472252, + "learning_rate": 8.571428571428571e-05, + "loss": 0.0207, + "step": 48 + }, + { + "epoch": 0.4399551066217733, + "grad_norm": 0.027948766946792603, + "learning_rate": 8.75e-05, + "loss": 0.0199, + "step": 49 + }, + { + "epoch": 0.4489337822671156, + "grad_norm": 0.028703948482871056, + "learning_rate": 8.92857142857143e-05, + "loss": 0.0206, + "step": 50 + }, + { + "epoch": 0.4489337822671156, + "eval_loss": 0.019621608778834343, + "eval_runtime": 6.2561, + "eval_samples_per_second": 7.992, + "eval_steps_per_second": 2.078, + "step": 50 + }, + { + "epoch": 0.45791245791245794, + "grad_norm": 0.021676093339920044, + "learning_rate": 9.107142857142857e-05, + "loss": 0.0203, + "step": 51 + }, + { + "epoch": 0.4668911335578002, + "grad_norm": 0.02454349212348461, + "learning_rate": 9.285714285714286e-05, + "loss": 0.0209, + "step": 52 + }, + { + "epoch": 0.47586980920314254, + "grad_norm": 0.02334459312260151, + "learning_rate": 9.464285714285715e-05, + "loss": 0.0197, + "step": 53 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.03568955883383751, + "learning_rate": 9.642857142857143e-05, + "loss": 0.018, + "step": 54 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 0.03706267848610878, + "learning_rate": 9.821428571428572e-05, + "loss": 0.0196, + "step": 55 + }, + { + "epoch": 0.49382716049382713, + "eval_loss": 0.01931421272456646, + "eval_runtime": 6.2564, + "eval_samples_per_second": 7.992, + "eval_steps_per_second": 2.078, + "step": 55 + }, + { + "epoch": 0.5028058361391695, + "grad_norm": 0.033131491392850876, + "learning_rate": 0.0001, + "loss": 0.0195, + "step": 56 + }, + { + "epoch": 0.5117845117845118, + "grad_norm": 0.025286365300416946, + "learning_rate": 9.999900908311602e-05, + "loss": 0.0195, + "step": 57 + }, + { + "epoch": 0.5207631874298541, + "grad_norm": 0.03591889888048172, + "learning_rate": 9.999603637174071e-05, + "loss": 0.0195, + "step": 58 + }, + { + "epoch": 0.5297418630751964, + "grad_norm": 0.025091370567679405, + "learning_rate": 9.999108198370249e-05, + "loss": 0.0195, + "step": 59 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 0.030630730092525482, + "learning_rate": 9.998414611537681e-05, + "loss": 0.0173, + "step": 60 + }, + { + "epoch": 0.5387205387205387, + "eval_loss": 0.019306931644678116, + "eval_runtime": 6.2911, + "eval_samples_per_second": 7.948, + "eval_steps_per_second": 2.066, + "step": 60 + }, + { + "epoch": 0.547699214365881, + "grad_norm": 0.03352862969040871, + "learning_rate": 9.997522904167844e-05, + "loss": 0.0199, + "step": 61 + }, + { + "epoch": 0.5566778900112234, + "grad_norm": 0.029540032148361206, + "learning_rate": 9.996433111605052e-05, + "loss": 0.0211, + "step": 62 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 0.03420255333185196, + "learning_rate": 9.995145277045061e-05, + "loss": 0.0181, + "step": 63 + }, + { + "epoch": 0.574635241301908, + "grad_norm": 0.026620274409651756, + "learning_rate": 9.993659451533353e-05, + "loss": 0.0206, + "step": 64 + }, + { + "epoch": 0.5836139169472503, + "grad_norm": 0.02483481727540493, + "learning_rate": 9.991975693963107e-05, + "loss": 0.0184, + "step": 65 + }, + { + "epoch": 0.5836139169472503, + "eval_loss": 0.019347084686160088, + "eval_runtime": 6.2549, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 2.078, + "step": 65 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.0339241661131382, + "learning_rate": 9.990094071072877e-05, + "loss": 0.0193, + "step": 66 + }, + { + "epoch": 0.6015712682379349, + "grad_norm": 0.0241928081959486, + "learning_rate": 9.988014657443941e-05, + "loss": 0.0193, + "step": 67 + }, + { + "epoch": 0.6105499438832772, + "grad_norm": 0.029822858050465584, + "learning_rate": 9.985737535497337e-05, + "loss": 0.0184, + "step": 68 + }, + { + "epoch": 0.6195286195286195, + "grad_norm": 0.023182636126875877, + "learning_rate": 9.983262795490613e-05, + "loss": 0.0183, + "step": 69 + }, + { + "epoch": 0.6285072951739619, + "grad_norm": 0.021179169416427612, + "learning_rate": 9.980590535514233e-05, + "loss": 0.0194, + "step": 70 + }, + { + "epoch": 0.6285072951739619, + "eval_loss": 0.01905701868236065, + "eval_runtime": 6.2492, + "eval_samples_per_second": 8.001, + "eval_steps_per_second": 2.08, + "step": 70 + }, + { + "epoch": 0.6374859708193041, + "grad_norm": 0.02753937430679798, + "learning_rate": 9.9777208614877e-05, + "loss": 0.0189, + "step": 71 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 0.027653615921735764, + "learning_rate": 9.97465388715535e-05, + "loss": 0.0191, + "step": 72 + }, + { + "epoch": 0.6554433221099888, + "grad_norm": 0.020188456401228905, + "learning_rate": 9.971389734081848e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.6644219977553311, + "grad_norm": 0.029147446155548096, + "learning_rate": 9.967928531647374e-05, + "loss": 0.0177, + "step": 74 + }, + { + "epoch": 0.6734006734006734, + "grad_norm": 0.023492921143770218, + "learning_rate": 9.96427041704248e-05, + "loss": 0.0182, + "step": 75 + }, + { + "epoch": 0.6734006734006734, + "eval_loss": 0.018457170575857162, + "eval_runtime": 6.2525, + "eval_samples_per_second": 7.997, + "eval_steps_per_second": 2.079, + "step": 75 + }, + { + "epoch": 0.6823793490460157, + "grad_norm": 0.016839003190398216, + "learning_rate": 9.960415535262671e-05, + "loss": 0.0169, + "step": 76 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 0.024092335253953934, + "learning_rate": 9.956364039102642e-05, + "loss": 0.0184, + "step": 77 + }, + { + "epoch": 0.7003367003367004, + "grad_norm": 0.026985522359609604, + "learning_rate": 9.952116089150232e-05, + "loss": 0.0187, + "step": 78 + }, + { + "epoch": 0.7093153759820426, + "grad_norm": 0.01973740942776203, + "learning_rate": 9.947671853780054e-05, + "loss": 0.0166, + "step": 79 + }, + { + "epoch": 0.7182940516273849, + "grad_norm": 0.01972176879644394, + "learning_rate": 9.943031509146825e-05, + "loss": 0.0169, + "step": 80 + }, + { + "epoch": 0.7182940516273849, + "eval_loss": 0.01826069876551628, + "eval_runtime": 6.2512, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.08, + "step": 80 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.020164869725704193, + "learning_rate": 9.938195239178374e-05, + "loss": 0.0172, + "step": 81 + }, + { + "epoch": 0.7362514029180696, + "grad_norm": 0.02163533680140972, + "learning_rate": 9.933163235568367e-05, + "loss": 0.0183, + "step": 82 + }, + { + "epoch": 0.745230078563412, + "grad_norm": 0.018531063571572304, + "learning_rate": 9.927935697768698e-05, + "loss": 0.0171, + "step": 83 + }, + { + "epoch": 0.7542087542087542, + "grad_norm": 0.027429502457380295, + "learning_rate": 9.922512832981584e-05, + "loss": 0.0214, + "step": 84 + }, + { + "epoch": 0.7631874298540965, + "grad_norm": 0.019089698791503906, + "learning_rate": 9.916894856151357e-05, + "loss": 0.0176, + "step": 85 + }, + { + "epoch": 0.7631874298540965, + "eval_loss": 0.017831869423389435, + "eval_runtime": 6.2838, + "eval_samples_per_second": 7.957, + "eval_steps_per_second": 2.069, + "step": 85 + }, + { + "epoch": 0.7721661054994389, + "grad_norm": 0.018925415351986885, + "learning_rate": 9.91108198995594e-05, + "loss": 0.0156, + "step": 86 + }, + { + "epoch": 0.7811447811447811, + "grad_norm": 0.021234937012195587, + "learning_rate": 9.905074464798024e-05, + "loss": 0.0186, + "step": 87 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 0.018775297328829765, + "learning_rate": 9.898872518795932e-05, + "loss": 0.0163, + "step": 88 + }, + { + "epoch": 0.7991021324354658, + "grad_norm": 0.016489777714014053, + "learning_rate": 9.892476397774186e-05, + "loss": 0.0173, + "step": 89 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.018042977899312973, + "learning_rate": 9.885886355253758e-05, + "loss": 0.0158, + "step": 90 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.017583945766091347, + "eval_runtime": 6.2497, + "eval_samples_per_second": 8.0, + "eval_steps_per_second": 2.08, + "step": 90 + }, + { + "epoch": 0.8170594837261503, + "grad_norm": 0.020747726783156395, + "learning_rate": 9.879102652442024e-05, + "loss": 0.0178, + "step": 91 + }, + { + "epoch": 0.8260381593714927, + "grad_norm": 0.024066736921668053, + "learning_rate": 9.872125558222409e-05, + "loss": 0.0158, + "step": 92 + }, + { + "epoch": 0.835016835016835, + "grad_norm": 0.015819448977708817, + "learning_rate": 9.864955349143734e-05, + "loss": 0.0162, + "step": 93 + }, + { + "epoch": 0.8439955106621774, + "grad_norm": 0.018356909975409508, + "learning_rate": 9.857592309409247e-05, + "loss": 0.0154, + "step": 94 + }, + { + "epoch": 0.8529741863075196, + "grad_norm": 0.027337217703461647, + "learning_rate": 9.850036730865364e-05, + "loss": 0.02, + "step": 95 + }, + { + "epoch": 0.8529741863075196, + "eval_loss": 0.017212502658367157, + "eval_runtime": 6.3032, + "eval_samples_per_second": 7.932, + "eval_steps_per_second": 2.062, + "step": 95 + }, + { + "epoch": 0.8619528619528619, + "grad_norm": 0.019369499757885933, + "learning_rate": 9.842288912990096e-05, + "loss": 0.0194, + "step": 96 + }, + { + "epoch": 0.8709315375982043, + "grad_norm": 0.0162061657756567, + "learning_rate": 9.83434916288119e-05, + "loss": 0.0152, + "step": 97 + }, + { + "epoch": 0.8799102132435466, + "grad_norm": 0.023736393079161644, + "learning_rate": 9.82621779524394e-05, + "loss": 0.017, + "step": 98 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.01927589811384678, + "learning_rate": 9.817895132378725e-05, + "loss": 0.0158, + "step": 99 + }, + { + "epoch": 0.8978675645342312, + "grad_norm": 0.025313647463917732, + "learning_rate": 9.809381504168234e-05, + "loss": 0.0165, + "step": 100 + }, + { + "epoch": 0.8978675645342312, + "eval_loss": 0.017287207767367363, + "eval_runtime": 6.2491, + "eval_samples_per_second": 8.001, + "eval_steps_per_second": 2.08, + "step": 100 + }, + { + "epoch": 0.9068462401795735, + "grad_norm": 0.021926885470747948, + "learning_rate": 9.800677248064382e-05, + "loss": 0.0169, + "step": 101 + }, + { + "epoch": 0.9158249158249159, + "grad_norm": 0.02485627494752407, + "learning_rate": 9.791782709074944e-05, + "loss": 0.0152, + "step": 102 + }, + { + "epoch": 0.9248035914702581, + "grad_norm": 0.021951181814074516, + "learning_rate": 9.782698239749873e-05, + "loss": 0.017, + "step": 103 + }, + { + "epoch": 0.9337822671156004, + "grad_norm": 0.02424493059515953, + "learning_rate": 9.77342420016733e-05, + "loss": 0.0172, + "step": 104 + }, + { + "epoch": 0.9427609427609428, + "grad_norm": 0.028901271522045135, + "learning_rate": 9.763960957919413e-05, + "loss": 0.0181, + "step": 105 + }, + { + "epoch": 0.9427609427609428, + "eval_loss": 0.016786355525255203, + "eval_runtime": 6.2498, + "eval_samples_per_second": 8.0, + "eval_steps_per_second": 2.08, + "step": 105 + }, + { + "epoch": 0.9517396184062851, + "grad_norm": 0.017801107838749886, + "learning_rate": 9.754308888097583e-05, + "loss": 0.0165, + "step": 106 + }, + { + "epoch": 0.9607182940516273, + "grad_norm": 0.037076808512210846, + "learning_rate": 9.744468373277797e-05, + "loss": 0.0162, + "step": 107 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.019457461312413216, + "learning_rate": 9.734439803505345e-05, + "loss": 0.0158, + "step": 108 + }, + { + "epoch": 0.978675645342312, + "grad_norm": 0.02191469632089138, + "learning_rate": 9.724223576279395e-05, + "loss": 0.0163, + "step": 109 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 0.024242915213108063, + "learning_rate": 9.713820096537225e-05, + "loss": 0.0176, + "step": 110 + }, + { + "epoch": 0.9876543209876543, + "eval_loss": 0.016797909513115883, + "eval_runtime": 6.2548, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 2.078, + "step": 110 + }, + { + "epoch": 0.9966329966329966, + "grad_norm": 0.02136295475065708, + "learning_rate": 9.703229776638185e-05, + "loss": 0.0166, + "step": 111 + }, + { + "epoch": 1.0078563411896746, + "grad_norm": 0.08904732018709183, + "learning_rate": 9.692453036347351e-05, + "loss": 0.0323, + "step": 112 + }, + { + "epoch": 1.0168350168350169, + "grad_norm": 0.031474873423576355, + "learning_rate": 9.681490302818874e-05, + "loss": 0.0159, + "step": 113 + }, + { + "epoch": 1.0258136924803591, + "grad_norm": 0.02854473888874054, + "learning_rate": 9.670342010579065e-05, + "loss": 0.0141, + "step": 114 + }, + { + "epoch": 1.0347923681257014, + "grad_norm": 0.029452061280608177, + "learning_rate": 9.659008601509168e-05, + "loss": 0.0184, + "step": 115 + }, + { + "epoch": 1.0347923681257014, + "eval_loss": 0.0183156318962574, + "eval_runtime": 6.2458, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.081, + "step": 115 + }, + { + "epoch": 1.0437710437710437, + "grad_norm": 0.02467503771185875, + "learning_rate": 9.647490524827834e-05, + "loss": 0.0157, + "step": 116 + }, + { + "epoch": 1.0527497194163862, + "grad_norm": 0.021488968282938004, + "learning_rate": 9.635788237073334e-05, + "loss": 0.0152, + "step": 117 + }, + { + "epoch": 1.0617283950617284, + "grad_norm": 0.02372926101088524, + "learning_rate": 9.623902202085444e-05, + "loss": 0.0176, + "step": 118 + }, + { + "epoch": 1.0707070707070707, + "grad_norm": 0.01808401569724083, + "learning_rate": 9.611832890987076e-05, + "loss": 0.0156, + "step": 119 + }, + { + "epoch": 1.079685746352413, + "grad_norm": 0.021259043365716934, + "learning_rate": 9.599580782165598e-05, + "loss": 0.0162, + "step": 120 + }, + { + "epoch": 1.079685746352413, + "eval_loss": 0.017910869792103767, + "eval_runtime": 6.2515, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.079, + "step": 120 + }, + { + "epoch": 1.0886644219977553, + "grad_norm": 0.024560727179050446, + "learning_rate": 9.587146361253868e-05, + "loss": 0.0161, + "step": 121 + }, + { + "epoch": 1.0976430976430978, + "grad_norm": 0.02424251101911068, + "learning_rate": 9.57453012111099e-05, + "loss": 0.0178, + "step": 122 + }, + { + "epoch": 1.10662177328844, + "grad_norm": 0.021675804629921913, + "learning_rate": 9.561732561802778e-05, + "loss": 0.017, + "step": 123 + }, + { + "epoch": 1.1156004489337823, + "grad_norm": 0.019624771550297737, + "learning_rate": 9.548754190581939e-05, + "loss": 0.017, + "step": 124 + }, + { + "epoch": 1.1245791245791246, + "grad_norm": 0.019135547801852226, + "learning_rate": 9.53559552186796e-05, + "loss": 0.017, + "step": 125 + }, + { + "epoch": 1.1245791245791246, + "eval_loss": 0.016845999285578728, + "eval_runtime": 6.2653, + "eval_samples_per_second": 7.98, + "eval_steps_per_second": 2.075, + "step": 125 + }, + { + "epoch": 1.1335578002244668, + "grad_norm": 0.012162208557128906, + "learning_rate": 9.522257077226717e-05, + "loss": 0.0123, + "step": 126 + }, + { + "epoch": 1.142536475869809, + "grad_norm": 0.017864948138594627, + "learning_rate": 9.508739385349812e-05, + "loss": 0.017, + "step": 127 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 0.017155688256025314, + "learning_rate": 9.49504298203361e-05, + "loss": 0.0156, + "step": 128 + }, + { + "epoch": 1.1604938271604939, + "grad_norm": 0.016472933813929558, + "learning_rate": 9.481168410158003e-05, + "loss": 0.0154, + "step": 129 + }, + { + "epoch": 1.1694725028058361, + "grad_norm": 0.014728706330060959, + "learning_rate": 9.467116219664894e-05, + "loss": 0.0143, + "step": 130 + }, + { + "epoch": 1.1694725028058361, + "eval_loss": 0.016655858606100082, + "eval_runtime": 6.2511, + "eval_samples_per_second": 7.999, + "eval_steps_per_second": 2.08, + "step": 130 + }, + { + "epoch": 1.1784511784511784, + "grad_norm": 0.014423094689846039, + "learning_rate": 9.45288696753639e-05, + "loss": 0.015, + "step": 131 + }, + { + "epoch": 1.1874298540965207, + "grad_norm": 0.01742757484316826, + "learning_rate": 9.438481217772744e-05, + "loss": 0.016, + "step": 132 + }, + { + "epoch": 1.1964085297418632, + "grad_norm": 0.01627536118030548, + "learning_rate": 9.423899541369978e-05, + "loss": 0.0131, + "step": 133 + }, + { + "epoch": 1.2053872053872055, + "grad_norm": 0.017055079340934753, + "learning_rate": 9.409142516297269e-05, + "loss": 0.016, + "step": 134 + }, + { + "epoch": 1.2143658810325477, + "grad_norm": 0.01879395917057991, + "learning_rate": 9.394210727474028e-05, + "loss": 0.0177, + "step": 135 + }, + { + "epoch": 1.2143658810325477, + "eval_loss": 0.016584472730755806, + "eval_runtime": 6.2484, + "eval_samples_per_second": 8.002, + "eval_steps_per_second": 2.081, + "step": 135 + }, + { + "epoch": 1.22334455667789, + "grad_norm": 0.01821809820830822, + "learning_rate": 9.379104766746722e-05, + "loss": 0.0163, + "step": 136 + }, + { + "epoch": 1.2323232323232323, + "grad_norm": 0.018335649743676186, + "learning_rate": 9.363825232865413e-05, + "loss": 0.0138, + "step": 137 + }, + { + "epoch": 1.2413019079685745, + "grad_norm": 0.014240071177482605, + "learning_rate": 9.348372731460023e-05, + "loss": 0.0119, + "step": 138 + }, + { + "epoch": 1.250280583613917, + "grad_norm": 0.02089606784284115, + "learning_rate": 9.332747875016332e-05, + "loss": 0.0166, + "step": 139 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 0.023216476663947105, + "learning_rate": 9.316951282851707e-05, + "loss": 0.0138, + "step": 140 + }, + { + "epoch": 1.2592592592592593, + "eval_loss": 0.01608719676733017, + "eval_runtime": 6.3168, + "eval_samples_per_second": 7.915, + "eval_steps_per_second": 2.058, + "step": 140 + }, + { + "epoch": 1.2682379349046016, + "grad_norm": 0.01914984919130802, + "learning_rate": 9.300983581090541e-05, + "loss": 0.0157, + "step": 141 + }, + { + "epoch": 1.2772166105499438, + "grad_norm": 0.020874816924333572, + "learning_rate": 9.284845402639446e-05, + "loss": 0.0142, + "step": 142 + }, + { + "epoch": 1.2861952861952861, + "grad_norm": 0.017665155231952667, + "learning_rate": 9.26853738716216e-05, + "loss": 0.0148, + "step": 143 + }, + { + "epoch": 1.2951739618406286, + "grad_norm": 0.01927882246673107, + "learning_rate": 9.2520601810542e-05, + "loss": 0.0153, + "step": 144 + }, + { + "epoch": 1.3041526374859709, + "grad_norm": 0.018328847363591194, + "learning_rate": 9.235414437417234e-05, + "loss": 0.0149, + "step": 145 + }, + { + "epoch": 1.3041526374859709, + "eval_loss": 0.01574772223830223, + "eval_runtime": 6.2751, + "eval_samples_per_second": 7.968, + "eval_steps_per_second": 2.072, + "step": 145 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.016565755009651184, + "learning_rate": 9.2186008160332e-05, + "loss": 0.0124, + "step": 146 + }, + { + "epoch": 1.3221099887766554, + "grad_norm": 0.023094868287444115, + "learning_rate": 9.201619983338153e-05, + "loss": 0.0188, + "step": 147 + }, + { + "epoch": 1.3310886644219977, + "grad_norm": 0.022368893027305603, + "learning_rate": 9.18447261239584e-05, + "loss": 0.0128, + "step": 148 + }, + { + "epoch": 1.34006734006734, + "grad_norm": 0.013615472242236137, + "learning_rate": 9.167159382871039e-05, + "loss": 0.0142, + "step": 149 + }, + { + "epoch": 1.3490460157126825, + "grad_norm": 0.021693168208003044, + "learning_rate": 9.149680981002609e-05, + "loss": 0.0162, + "step": 150 + }, + { + "epoch": 1.3490460157126825, + "eval_loss": 0.0159834623336792, + "eval_runtime": 6.2576, + "eval_samples_per_second": 7.99, + "eval_steps_per_second": 2.077, + "step": 150 + }, + { + "epoch": 1.3580246913580247, + "grad_norm": 0.016352703794836998, + "learning_rate": 9.13203809957629e-05, + "loss": 0.0149, + "step": 151 + }, + { + "epoch": 1.367003367003367, + "grad_norm": 0.019222285598516464, + "learning_rate": 9.114231437897244e-05, + "loss": 0.0166, + "step": 152 + }, + { + "epoch": 1.3759820426487093, + "grad_norm": 0.014903879724442959, + "learning_rate": 9.096261701762342e-05, + "loss": 0.0146, + "step": 153 + }, + { + "epoch": 1.3849607182940518, + "grad_norm": 0.01564696989953518, + "learning_rate": 9.078129603432181e-05, + "loss": 0.0141, + "step": 154 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 0.020336154848337173, + "learning_rate": 9.059835861602853e-05, + "loss": 0.0148, + "step": 155 + }, + { + "epoch": 1.393939393939394, + "eval_loss": 0.015588033944368362, + "eval_runtime": 6.2569, + "eval_samples_per_second": 7.991, + "eval_steps_per_second": 2.078, + "step": 155 + }, + { + "epoch": 1.4029180695847363, + "grad_norm": 0.01628415659070015, + "learning_rate": 9.041381201377468e-05, + "loss": 0.0152, + "step": 156 + }, + { + "epoch": 1.4118967452300786, + "grad_norm": 0.01812385767698288, + "learning_rate": 9.0227663542374e-05, + "loss": 0.0146, + "step": 157 + }, + { + "epoch": 1.4208754208754208, + "grad_norm": 0.023446347564458847, + "learning_rate": 9.003992058013302e-05, + "loss": 0.015, + "step": 158 + }, + { + "epoch": 1.4298540965207631, + "grad_norm": 0.02031407319009304, + "learning_rate": 8.985059056855858e-05, + "loss": 0.0144, + "step": 159 + }, + { + "epoch": 1.4388327721661054, + "grad_norm": 0.022712191566824913, + "learning_rate": 8.965968101206291e-05, + "loss": 0.0168, + "step": 160 + }, + { + "epoch": 1.4388327721661054, + "eval_loss": 0.015444611199200153, + "eval_runtime": 6.2545, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 2.079, + "step": 160 + }, + { + "epoch": 1.4478114478114479, + "grad_norm": 0.018817342817783356, + "learning_rate": 8.94671994776661e-05, + "loss": 0.0147, + "step": 161 + }, + { + "epoch": 1.4567901234567902, + "grad_norm": 0.014623799361288548, + "learning_rate": 8.927315359469626e-05, + "loss": 0.0129, + "step": 162 + }, + { + "epoch": 1.4657687991021324, + "grad_norm": 0.02344674989581108, + "learning_rate": 8.907755105448704e-05, + "loss": 0.019, + "step": 163 + }, + { + "epoch": 1.4747474747474747, + "grad_norm": 0.02390502393245697, + "learning_rate": 8.888039961007282e-05, + "loss": 0.0157, + "step": 164 + }, + { + "epoch": 1.4837261503928172, + "grad_norm": 0.017417486757040024, + "learning_rate": 8.868170707588142e-05, + "loss": 0.0148, + "step": 165 + }, + { + "epoch": 1.4837261503928172, + "eval_loss": 0.015270690433681011, + "eval_runtime": 6.2526, + "eval_samples_per_second": 7.997, + "eval_steps_per_second": 2.079, + "step": 165 + }, + { + "epoch": 1.4927048260381595, + "grad_norm": 0.01864814944565296, + "learning_rate": 8.848148132742431e-05, + "loss": 0.0133, + "step": 166 + }, + { + "epoch": 1.5016835016835017, + "grad_norm": 0.014610537327826023, + "learning_rate": 8.827973030098448e-05, + "loss": 0.0135, + "step": 167 + }, + { + "epoch": 1.510662177328844, + "grad_norm": 0.0179497878998518, + "learning_rate": 8.807646199330187e-05, + "loss": 0.0159, + "step": 168 + }, + { + "epoch": 1.5196408529741863, + "grad_norm": 0.024205263704061508, + "learning_rate": 8.787168446125638e-05, + "loss": 0.0129, + "step": 169 + }, + { + "epoch": 1.5286195286195285, + "grad_norm": 0.018040824681520462, + "learning_rate": 8.766540582154859e-05, + "loss": 0.0146, + "step": 170 + }, + { + "epoch": 1.5286195286195285, + "eval_loss": 0.015371887013316154, + "eval_runtime": 6.2476, + "eval_samples_per_second": 8.003, + "eval_steps_per_second": 2.081, + "step": 170 + }, + { + "epoch": 1.5375982042648708, + "grad_norm": 0.017247065901756287, + "learning_rate": 8.745763425037797e-05, + "loss": 0.015, + "step": 171 + }, + { + "epoch": 1.546576879910213, + "grad_norm": 0.019299406558275223, + "learning_rate": 8.724837798311882e-05, + "loss": 0.0153, + "step": 172 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.015991205349564552, + "learning_rate": 8.703764531399392e-05, + "loss": 0.0122, + "step": 173 + }, + { + "epoch": 1.5645342312008979, + "grad_norm": 0.015190811827778816, + "learning_rate": 8.682544459574562e-05, + "loss": 0.0144, + "step": 174 + }, + { + "epoch": 1.5735129068462403, + "grad_norm": 0.01828867383301258, + "learning_rate": 8.661178423930491e-05, + "loss": 0.0137, + "step": 175 + }, + { + "epoch": 1.5735129068462403, + "eval_loss": 0.014978926628828049, + "eval_runtime": 6.2611, + "eval_samples_per_second": 7.986, + "eval_steps_per_second": 2.076, + "step": 175 + }, + { + "epoch": 1.5824915824915826, + "grad_norm": 0.01734633930027485, + "learning_rate": 8.639667271345798e-05, + "loss": 0.0177, + "step": 176 + }, + { + "epoch": 1.5914702581369249, + "grad_norm": 0.019932016730308533, + "learning_rate": 8.618011854451056e-05, + "loss": 0.0115, + "step": 177 + }, + { + "epoch": 1.6004489337822672, + "grad_norm": 0.026310300454497337, + "learning_rate": 8.596213031594991e-05, + "loss": 0.0167, + "step": 178 + }, + { + "epoch": 1.6094276094276094, + "grad_norm": 0.024424167349934578, + "learning_rate": 8.57427166681047e-05, + "loss": 0.0153, + "step": 179 + }, + { + "epoch": 1.6184062850729517, + "grad_norm": 0.016459695994853973, + "learning_rate": 8.552188629780244e-05, + "loss": 0.0144, + "step": 180 + }, + { + "epoch": 1.6184062850729517, + "eval_loss": 0.014994567260146141, + "eval_runtime": 6.2493, + "eval_samples_per_second": 8.001, + "eval_steps_per_second": 2.08, + "step": 180 + }, + { + "epoch": 1.627384960718294, + "grad_norm": 0.021496936678886414, + "learning_rate": 8.529964795802485e-05, + "loss": 0.0125, + "step": 181 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.019307784736156464, + "learning_rate": 8.507601045756085e-05, + "loss": 0.0152, + "step": 182 + }, + { + "epoch": 1.6453423120089785, + "grad_norm": 0.016401201486587524, + "learning_rate": 8.485098266065744e-05, + "loss": 0.0125, + "step": 183 + }, + { + "epoch": 1.654320987654321, + "grad_norm": 0.023525064811110497, + "learning_rate": 8.462457348666835e-05, + "loss": 0.0163, + "step": 184 + }, + { + "epoch": 1.6632996632996633, + "grad_norm": 0.020476635545492172, + "learning_rate": 8.439679190970052e-05, + "loss": 0.0129, + "step": 185 + }, + { + "epoch": 1.6632996632996633, + "eval_loss": 0.014757846482098103, + "eval_runtime": 6.3022, + "eval_samples_per_second": 7.934, + "eval_steps_per_second": 2.063, + "step": 185 + }, + { + "epoch": 1.6722783389450058, + "grad_norm": 0.027425814419984818, + "learning_rate": 8.416764695825835e-05, + "loss": 0.015, + "step": 186 + }, + { + "epoch": 1.681257014590348, + "grad_norm": 0.019989849999547005, + "learning_rate": 8.39371477148859e-05, + "loss": 0.0166, + "step": 187 + }, + { + "epoch": 1.6902356902356903, + "grad_norm": 0.015978503972291946, + "learning_rate": 8.370530331580686e-05, + "loss": 0.0131, + "step": 188 + }, + { + "epoch": 1.6992143658810326, + "grad_norm": 0.02169989049434662, + "learning_rate": 8.347212295056239e-05, + "loss": 0.0158, + "step": 189 + }, + { + "epoch": 1.7081930415263749, + "grad_norm": 0.02388261817395687, + "learning_rate": 8.323761586164695e-05, + "loss": 0.0139, + "step": 190 + }, + { + "epoch": 1.7081930415263749, + "eval_loss": 0.014534353278577328, + "eval_runtime": 6.2541, + "eval_samples_per_second": 7.995, + "eval_steps_per_second": 2.079, + "step": 190 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.014390088617801666, + "learning_rate": 8.300179134414188e-05, + "loss": 0.0116, + "step": 191 + }, + { + "epoch": 1.7261503928170594, + "grad_norm": 0.022068368270993233, + "learning_rate": 8.276465874534702e-05, + "loss": 0.0127, + "step": 192 + }, + { + "epoch": 1.7351290684624017, + "grad_norm": 0.020960543304681778, + "learning_rate": 8.252622746441021e-05, + "loss": 0.0142, + "step": 193 + }, + { + "epoch": 1.7441077441077442, + "grad_norm": 0.01721160300076008, + "learning_rate": 8.228650695195472e-05, + "loss": 0.0151, + "step": 194 + }, + { + "epoch": 1.7530864197530864, + "grad_norm": 0.014986108988523483, + "learning_rate": 8.204550670970469e-05, + "loss": 0.013, + "step": 195 + }, + { + "epoch": 1.7530864197530864, + "eval_loss": 0.014544461853802204, + "eval_runtime": 6.2711, + "eval_samples_per_second": 7.973, + "eval_steps_per_second": 2.073, + "step": 195 + }, + { + "epoch": 1.7620650953984287, + "grad_norm": 0.01918155886232853, + "learning_rate": 8.180323629010848e-05, + "loss": 0.0146, + "step": 196 + }, + { + "epoch": 1.7710437710437712, + "grad_norm": 0.015731407329440117, + "learning_rate": 8.155970529596006e-05, + "loss": 0.0147, + "step": 197 + }, + { + "epoch": 1.7800224466891135, + "grad_norm": 0.01721978560090065, + "learning_rate": 8.131492338001839e-05, + "loss": 0.0132, + "step": 198 + }, + { + "epoch": 1.7890011223344557, + "grad_norm": 0.019031619653105736, + "learning_rate": 8.106890024462481e-05, + "loss": 0.0149, + "step": 199 + }, + { + "epoch": 1.797979797979798, + "grad_norm": 0.017630157992243767, + "learning_rate": 8.082164564131845e-05, + "loss": 0.013, + "step": 200 + }, + { + "epoch": 1.797979797979798, + "eval_loss": 0.01437403354793787, + "eval_runtime": 6.2492, + "eval_samples_per_second": 8.001, + "eval_steps_per_second": 2.08, + "step": 200 + }, + { + "epoch": 1.8069584736251403, + "grad_norm": 0.02445027232170105, + "learning_rate": 8.057316937044977e-05, + "loss": 0.018, + "step": 201 + }, + { + "epoch": 1.8159371492704826, + "grad_norm": 0.021661758422851562, + "learning_rate": 8.032348128079203e-05, + "loss": 0.0151, + "step": 202 + }, + { + "epoch": 1.8249158249158248, + "grad_norm": 0.01885199546813965, + "learning_rate": 8.0072591269151e-05, + "loss": 0.0135, + "step": 203 + }, + { + "epoch": 1.833894500561167, + "grad_norm": 0.020552242174744606, + "learning_rate": 7.982050927997264e-05, + "loss": 0.0141, + "step": 204 + }, + { + "epoch": 1.8428731762065096, + "grad_norm": 0.016624536365270615, + "learning_rate": 7.956724530494887e-05, + "loss": 0.0124, + "step": 205 + }, + { + "epoch": 1.8428731762065096, + "eval_loss": 0.014373213052749634, + "eval_runtime": 6.2577, + "eval_samples_per_second": 7.99, + "eval_steps_per_second": 2.077, + "step": 205 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.023108718916773796, + "learning_rate": 7.931280938262169e-05, + "loss": 0.0166, + "step": 206 + }, + { + "epoch": 1.8608305274971941, + "grad_norm": 0.014059687964618206, + "learning_rate": 7.905721159798513e-05, + "loss": 0.0129, + "step": 207 + }, + { + "epoch": 1.8698092031425366, + "grad_norm": 0.019422784447669983, + "learning_rate": 7.880046208208563e-05, + "loss": 0.0149, + "step": 208 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 0.01858607865869999, + "learning_rate": 7.854257101162037e-05, + "loss": 0.0134, + "step": 209 + }, + { + "epoch": 1.8877665544332212, + "grad_norm": 0.019326094537973404, + "learning_rate": 7.828354860853399e-05, + "loss": 0.0135, + "step": 210 + }, + { + "epoch": 1.8877665544332212, + "eval_loss": 0.014275193214416504, + "eval_runtime": 6.2658, + "eval_samples_per_second": 7.98, + "eval_steps_per_second": 2.075, + "step": 210 + }, + { + "epoch": 1.8967452300785634, + "grad_norm": 0.013561426661908627, + "learning_rate": 7.802340513961342e-05, + "loss": 0.012, + "step": 211 + }, + { + "epoch": 1.9057239057239057, + "grad_norm": 0.01908908411860466, + "learning_rate": 7.776215091608085e-05, + "loss": 0.0132, + "step": 212 + }, + { + "epoch": 1.914702581369248, + "grad_norm": 0.016201447695493698, + "learning_rate": 7.749979629318516e-05, + "loss": 0.0126, + "step": 213 + }, + { + "epoch": 1.9236812570145903, + "grad_norm": 0.015391174703836441, + "learning_rate": 7.723635166979133e-05, + "loss": 0.0135, + "step": 214 + }, + { + "epoch": 1.9326599326599325, + "grad_norm": 0.016166241839528084, + "learning_rate": 7.697182748796841e-05, + "loss": 0.0128, + "step": 215 + }, + { + "epoch": 1.9326599326599325, + "eval_loss": 0.014707864262163639, + "eval_runtime": 6.2641, + "eval_samples_per_second": 7.982, + "eval_steps_per_second": 2.075, + "step": 215 + }, + { + "epoch": 1.941638608305275, + "grad_norm": 0.02329264022409916, + "learning_rate": 7.670623423257548e-05, + "loss": 0.0131, + "step": 216 + }, + { + "epoch": 1.9506172839506173, + "grad_norm": 0.01891166903078556, + "learning_rate": 7.64395824308462e-05, + "loss": 0.0137, + "step": 217 + }, + { + "epoch": 1.9595959595959596, + "grad_norm": 0.015786990523338318, + "learning_rate": 7.617188265197148e-05, + "loss": 0.0128, + "step": 218 + }, + { + "epoch": 1.968574635241302, + "grad_norm": 0.021409448236227036, + "learning_rate": 7.590314550668054e-05, + "loss": 0.0142, + "step": 219 + }, + { + "epoch": 1.9775533108866443, + "grad_norm": 0.025052543729543686, + "learning_rate": 7.563338164682036e-05, + "loss": 0.0149, + "step": 220 + }, + { + "epoch": 1.9775533108866443, + "eval_loss": 0.01433955691754818, + "eval_runtime": 6.2639, + "eval_samples_per_second": 7.982, + "eval_steps_per_second": 2.075, + "step": 220 + }, + { + "epoch": 1.9865319865319866, + "grad_norm": 0.013938682153820992, + "learning_rate": 7.536260176493348e-05, + "loss": 0.0143, + "step": 221 + }, + { + "epoch": 1.9955106621773289, + "grad_norm": 0.01827586442232132, + "learning_rate": 7.509081659383417e-05, + "loss": 0.0134, + "step": 222 + }, + { + "epoch": 2.006734006734007, + "grad_norm": 0.04617556184530258, + "learning_rate": 7.481803690618304e-05, + "loss": 0.0255, + "step": 223 + }, + { + "epoch": 2.015712682379349, + "grad_norm": 0.01754753105342388, + "learning_rate": 7.454427351405999e-05, + "loss": 0.0154, + "step": 224 + }, + { + "epoch": 2.0246913580246915, + "grad_norm": 0.025898197665810585, + "learning_rate": 7.426953726853574e-05, + "loss": 0.0138, + "step": 225 + }, + { + "epoch": 2.0246913580246915, + "eval_loss": 0.014397691935300827, + "eval_runtime": 6.2513, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.08, + "step": 225 + }, + { + "epoch": 2.0336700336700337, + "grad_norm": 0.01992025040090084, + "learning_rate": 7.399383905924165e-05, + "loss": 0.0113, + "step": 226 + }, + { + "epoch": 2.042648709315376, + "grad_norm": 0.014167600311338902, + "learning_rate": 7.371718981393815e-05, + "loss": 0.0108, + "step": 227 + }, + { + "epoch": 2.0516273849607183, + "grad_norm": 0.021143754944205284, + "learning_rate": 7.343960049808156e-05, + "loss": 0.0136, + "step": 228 + }, + { + "epoch": 2.0606060606060606, + "grad_norm": 0.025208963081240654, + "learning_rate": 7.316108211438945e-05, + "loss": 0.0129, + "step": 229 + }, + { + "epoch": 2.069584736251403, + "grad_norm": 0.017190365120768547, + "learning_rate": 7.288164570240463e-05, + "loss": 0.0127, + "step": 230 + }, + { + "epoch": 2.069584736251403, + "eval_loss": 0.014279232360422611, + "eval_runtime": 6.2544, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 2.079, + "step": 230 + }, + { + "epoch": 2.078563411896745, + "grad_norm": 0.01911742240190506, + "learning_rate": 7.26013023380574e-05, + "loss": 0.0121, + "step": 231 + }, + { + "epoch": 2.0875420875420874, + "grad_norm": 0.022378170862793922, + "learning_rate": 7.232006313322667e-05, + "loss": 0.013, + "step": 232 + }, + { + "epoch": 2.0965207631874296, + "grad_norm": 0.020126372575759888, + "learning_rate": 7.203793923529956e-05, + "loss": 0.0127, + "step": 233 + }, + { + "epoch": 2.1054994388327724, + "grad_norm": 0.018375013023614883, + "learning_rate": 7.175494182672939e-05, + "loss": 0.0141, + "step": 234 + }, + { + "epoch": 2.1144781144781146, + "grad_norm": 0.016569405794143677, + "learning_rate": 7.147108212459257e-05, + "loss": 0.0116, + "step": 235 + }, + { + "epoch": 2.1144781144781146, + "eval_loss": 0.0142152588814497, + "eval_runtime": 6.2529, + "eval_samples_per_second": 7.996, + "eval_steps_per_second": 2.079, + "step": 235 + }, + { + "epoch": 2.123456790123457, + "grad_norm": 0.013866296038031578, + "learning_rate": 7.118637138014396e-05, + "loss": 0.011, + "step": 236 + }, + { + "epoch": 2.132435465768799, + "grad_norm": 0.0164170078933239, + "learning_rate": 7.090082087837092e-05, + "loss": 0.0137, + "step": 237 + }, + { + "epoch": 2.1414141414141414, + "grad_norm": 0.014653601683676243, + "learning_rate": 7.061444193754596e-05, + "loss": 0.012, + "step": 238 + }, + { + "epoch": 2.1503928170594837, + "grad_norm": 0.020761555060744286, + "learning_rate": 7.032724590877821e-05, + "loss": 0.0119, + "step": 239 + }, + { + "epoch": 2.159371492704826, + "grad_norm": 0.019674135372042656, + "learning_rate": 7.003924417556343e-05, + "loss": 0.0128, + "step": 240 + }, + { + "epoch": 2.159371492704826, + "eval_loss": 0.014257782138884068, + "eval_runtime": 6.307, + "eval_samples_per_second": 7.928, + "eval_steps_per_second": 2.061, + "step": 240 + }, + { + "epoch": 2.1683501683501682, + "grad_norm": 0.017319727689027786, + "learning_rate": 6.975044815333282e-05, + "loss": 0.0109, + "step": 241 + }, + { + "epoch": 2.1773288439955105, + "grad_norm": 0.02167445980012417, + "learning_rate": 6.946086928900054e-05, + "loss": 0.0132, + "step": 242 + }, + { + "epoch": 2.186307519640853, + "grad_norm": 0.016814757138490677, + "learning_rate": 6.917051906051006e-05, + "loss": 0.0106, + "step": 243 + }, + { + "epoch": 2.1952861952861955, + "grad_norm": 0.012457667849957943, + "learning_rate": 6.887940897637908e-05, + "loss": 0.0103, + "step": 244 + }, + { + "epoch": 2.204264870931538, + "grad_norm": 0.022774742916226387, + "learning_rate": 6.858755057524354e-05, + "loss": 0.0145, + "step": 245 + }, + { + "epoch": 2.204264870931538, + "eval_loss": 0.014115707948803902, + "eval_runtime": 6.2503, + "eval_samples_per_second": 8.0, + "eval_steps_per_second": 2.08, + "step": 245 + }, + { + "epoch": 2.21324354657688, + "grad_norm": 0.018725769594311714, + "learning_rate": 6.829495542540013e-05, + "loss": 0.0118, + "step": 246 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.019993796944618225, + "learning_rate": 6.80016351243478e-05, + "loss": 0.0142, + "step": 247 + }, + { + "epoch": 2.2312008978675646, + "grad_norm": 0.016696954146027565, + "learning_rate": 6.77076012983281e-05, + "loss": 0.0119, + "step": 248 + }, + { + "epoch": 2.240179573512907, + "grad_norm": 0.016207238659262657, + "learning_rate": 6.741286560186437e-05, + "loss": 0.0096, + "step": 249 + }, + { + "epoch": 2.249158249158249, + "grad_norm": 0.019286343827843666, + "learning_rate": 6.711743971729967e-05, + "loss": 0.0147, + "step": 250 + }, + { + "epoch": 2.249158249158249, + "eval_loss": 0.013896584510803223, + "eval_runtime": 6.2584, + "eval_samples_per_second": 7.989, + "eval_steps_per_second": 2.077, + "step": 250 + }, + { + "epoch": 2.2581369248035914, + "grad_norm": 0.013443589210510254, + "learning_rate": 6.682133535433393e-05, + "loss": 0.0102, + "step": 251 + }, + { + "epoch": 2.2671156004489337, + "grad_norm": 0.016910936683416367, + "learning_rate": 6.652456424955963e-05, + "loss": 0.0147, + "step": 252 + }, + { + "epoch": 2.276094276094276, + "grad_norm": 0.01767115481197834, + "learning_rate": 6.622713816599673e-05, + "loss": 0.0112, + "step": 253 + }, + { + "epoch": 2.285072951739618, + "grad_norm": 0.017718922346830368, + "learning_rate": 6.592906889262632e-05, + "loss": 0.013, + "step": 254 + }, + { + "epoch": 2.2940516273849605, + "grad_norm": 0.01557993981987238, + "learning_rate": 6.563036824392344e-05, + "loss": 0.0114, + "step": 255 + }, + { + "epoch": 2.2940516273849605, + "eval_loss": 0.013858611695468426, + "eval_runtime": 6.2582, + "eval_samples_per_second": 7.99, + "eval_steps_per_second": 2.077, + "step": 255 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.015414374880492687, + "learning_rate": 6.533104805938873e-05, + "loss": 0.0114, + "step": 256 + }, + { + "epoch": 2.3120089786756455, + "grad_norm": 0.01631319336593151, + "learning_rate": 6.503112020307916e-05, + "loss": 0.0116, + "step": 257 + }, + { + "epoch": 2.3209876543209877, + "grad_norm": 0.02034182660281658, + "learning_rate": 6.473059656313782e-05, + "loss": 0.0133, + "step": 258 + }, + { + "epoch": 2.32996632996633, + "grad_norm": 0.01835636980831623, + "learning_rate": 6.442948905132266e-05, + "loss": 0.0127, + "step": 259 + }, + { + "epoch": 2.3389450056116723, + "grad_norm": 0.01679323986172676, + "learning_rate": 6.412780960253436e-05, + "loss": 0.0114, + "step": 260 + }, + { + "epoch": 2.3389450056116723, + "eval_loss": 0.01390094868838787, + "eval_runtime": 6.2592, + "eval_samples_per_second": 7.988, + "eval_steps_per_second": 2.077, + "step": 260 + }, + { + "epoch": 2.3479236812570146, + "grad_norm": 0.016404911875724792, + "learning_rate": 6.382557017434332e-05, + "loss": 0.0122, + "step": 261 + }, + { + "epoch": 2.356902356902357, + "grad_norm": 0.012671472504734993, + "learning_rate": 6.352278274651561e-05, + "loss": 0.0091, + "step": 262 + }, + { + "epoch": 2.365881032547699, + "grad_norm": 0.01708405278623104, + "learning_rate": 6.321945932053822e-05, + "loss": 0.0125, + "step": 263 + }, + { + "epoch": 2.3748597081930414, + "grad_norm": 0.019615929573774338, + "learning_rate": 6.291561191914333e-05, + "loss": 0.0125, + "step": 264 + }, + { + "epoch": 2.3838383838383836, + "grad_norm": 0.014311819337308407, + "learning_rate": 6.261125258583171e-05, + "loss": 0.0112, + "step": 265 + }, + { + "epoch": 2.3838383838383836, + "eval_loss": 0.013734661974012852, + "eval_runtime": 6.2541, + "eval_samples_per_second": 7.995, + "eval_steps_per_second": 2.079, + "step": 265 + }, + { + "epoch": 2.3928170594837264, + "grad_norm": 0.01536188181489706, + "learning_rate": 6.230639338439549e-05, + "loss": 0.0134, + "step": 266 + }, + { + "epoch": 2.4017957351290686, + "grad_norm": 0.01723441854119301, + "learning_rate": 6.200104639843985e-05, + "loss": 0.0125, + "step": 267 + }, + { + "epoch": 2.410774410774411, + "grad_norm": 0.013469617813825607, + "learning_rate": 6.169522373090412e-05, + "loss": 0.0117, + "step": 268 + }, + { + "epoch": 2.419753086419753, + "grad_norm": 0.016679449006915092, + "learning_rate": 6.138893750358212e-05, + "loss": 0.012, + "step": 269 + }, + { + "epoch": 2.4287317620650954, + "grad_norm": 0.014688557013869286, + "learning_rate": 6.108219985664161e-05, + "loss": 0.0105, + "step": 270 + }, + { + "epoch": 2.4287317620650954, + "eval_loss": 0.013788803480565548, + "eval_runtime": 6.2591, + "eval_samples_per_second": 7.988, + "eval_steps_per_second": 2.077, + "step": 270 + }, + { + "epoch": 2.4377104377104377, + "grad_norm": 0.01410532183945179, + "learning_rate": 6.0775022948143115e-05, + "loss": 0.0137, + "step": 271 + }, + { + "epoch": 2.44668911335578, + "grad_norm": 0.018949788063764572, + "learning_rate": 6.046741895355802e-05, + "loss": 0.0117, + "step": 272 + }, + { + "epoch": 2.4556677890011223, + "grad_norm": 0.01841340772807598, + "learning_rate": 6.015940006528602e-05, + "loss": 0.0108, + "step": 273 + }, + { + "epoch": 2.4646464646464645, + "grad_norm": 0.015688767656683922, + "learning_rate": 5.9850978492171794e-05, + "loss": 0.011, + "step": 274 + }, + { + "epoch": 2.473625140291807, + "grad_norm": 0.017808249220252037, + "learning_rate": 5.954216645902109e-05, + "loss": 0.0129, + "step": 275 + }, + { + "epoch": 2.473625140291807, + "eval_loss": 0.01363787055015564, + "eval_runtime": 6.2939, + "eval_samples_per_second": 7.944, + "eval_steps_per_second": 2.066, + "step": 275 + }, + { + "epoch": 2.482603815937149, + "grad_norm": 0.016168171539902687, + "learning_rate": 5.923297620611623e-05, + "loss": 0.0107, + "step": 276 + }, + { + "epoch": 2.4915824915824913, + "grad_norm": 0.016473444178700447, + "learning_rate": 5.892341998873089e-05, + "loss": 0.0137, + "step": 277 + }, + { + "epoch": 2.500561167227834, + "grad_norm": 0.019177664071321487, + "learning_rate": 5.861351007664434e-05, + "loss": 0.0127, + "step": 278 + }, + { + "epoch": 2.5095398428731763, + "grad_norm": 0.01716047339141369, + "learning_rate": 5.83032587536552e-05, + "loss": 0.0127, + "step": 279 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 0.019998129457235336, + "learning_rate": 5.799267831709442e-05, + "loss": 0.014, + "step": 280 + }, + { + "epoch": 2.5185185185185186, + "eval_loss": 0.013543778099119663, + "eval_runtime": 6.2505, + "eval_samples_per_second": 7.999, + "eval_steps_per_second": 2.08, + "step": 280 + }, + { + "epoch": 2.527497194163861, + "grad_norm": 0.013933787122368813, + "learning_rate": 5.7681781077337905e-05, + "loss": 0.0096, + "step": 281 + }, + { + "epoch": 2.536475869809203, + "grad_norm": 0.015824446454644203, + "learning_rate": 5.737057935731868e-05, + "loss": 0.0092, + "step": 282 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.01764868013560772, + "learning_rate": 5.705908549203823e-05, + "loss": 0.0126, + "step": 283 + }, + { + "epoch": 2.5544332210998877, + "grad_norm": 0.022920994088053703, + "learning_rate": 5.674731182807781e-05, + "loss": 0.0122, + "step": 284 + }, + { + "epoch": 2.56341189674523, + "grad_norm": 0.01605917327105999, + "learning_rate": 5.643527072310891e-05, + "loss": 0.0124, + "step": 285 + }, + { + "epoch": 2.56341189674523, + "eval_loss": 0.013581929728388786, + "eval_runtime": 6.2549, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 2.078, + "step": 285 + }, + { + "epoch": 2.5723905723905722, + "grad_norm": 0.022532224655151367, + "learning_rate": 5.612297454540352e-05, + "loss": 0.0134, + "step": 286 + }, + { + "epoch": 2.581369248035915, + "grad_norm": 0.01806466281414032, + "learning_rate": 5.581043567334383e-05, + "loss": 0.0105, + "step": 287 + }, + { + "epoch": 2.590347923681257, + "grad_norm": 0.019253233447670937, + "learning_rate": 5.5497666494931654e-05, + "loss": 0.0116, + "step": 288 + }, + { + "epoch": 2.5993265993265995, + "grad_norm": 0.016167184337973595, + "learning_rate": 5.518467940729739e-05, + "loss": 0.0127, + "step": 289 + }, + { + "epoch": 2.6083052749719418, + "grad_norm": 0.019589709118008614, + "learning_rate": 5.487148681620862e-05, + "loss": 0.0128, + "step": 290 + }, + { + "epoch": 2.6083052749719418, + "eval_loss": 0.01327795721590519, + "eval_runtime": 6.258, + "eval_samples_per_second": 7.99, + "eval_steps_per_second": 2.077, + "step": 290 + }, + { + "epoch": 2.617283950617284, + "grad_norm": 0.017755387350916862, + "learning_rate": 5.455810113557839e-05, + "loss": 0.0129, + "step": 291 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.01898750476539135, + "learning_rate": 5.4244534786973214e-05, + "loss": 0.0113, + "step": 292 + }, + { + "epoch": 2.6352413019079686, + "grad_norm": 0.014957334846258163, + "learning_rate": 5.3930800199120616e-05, + "loss": 0.0132, + "step": 293 + }, + { + "epoch": 2.644219977553311, + "grad_norm": 0.014244873076677322, + "learning_rate": 5.361690980741663e-05, + "loss": 0.0111, + "step": 294 + }, + { + "epoch": 2.653198653198653, + "grad_norm": 0.013911189511418343, + "learning_rate": 5.330287605343279e-05, + "loss": 0.0106, + "step": 295 + }, + { + "epoch": 2.653198653198653, + "eval_loss": 0.01291807834059, + "eval_runtime": 6.2514, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.08, + "step": 295 + }, + { + "epoch": 2.6621773288439954, + "grad_norm": 0.017059607431292534, + "learning_rate": 5.298871138442307e-05, + "loss": 0.0127, + "step": 296 + }, + { + "epoch": 2.6711560044893377, + "grad_norm": 0.0140462601557374, + "learning_rate": 5.267442825283048e-05, + "loss": 0.0123, + "step": 297 + }, + { + "epoch": 2.68013468013468, + "grad_norm": 0.019492056220769882, + "learning_rate": 5.236003911579345e-05, + "loss": 0.0138, + "step": 298 + }, + { + "epoch": 2.689113355780022, + "grad_norm": 0.018796566873788834, + "learning_rate": 5.204555643465215e-05, + "loss": 0.011, + "step": 299 + }, + { + "epoch": 2.698092031425365, + "grad_norm": 0.012939135544002056, + "learning_rate": 5.173099267445451e-05, + "loss": 0.0099, + "step": 300 + }, + { + "epoch": 2.698092031425365, + "eval_loss": 0.012918239459395409, + "eval_runtime": 6.2607, + "eval_samples_per_second": 7.986, + "eval_steps_per_second": 2.076, + "step": 300 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.01702980510890484, + "learning_rate": 5.1416360303462206e-05, + "loss": 0.0116, + "step": 301 + }, + { + "epoch": 2.7160493827160495, + "grad_norm": 0.021528059616684914, + "learning_rate": 5.110167179265636e-05, + "loss": 0.0134, + "step": 302 + }, + { + "epoch": 2.7250280583613917, + "grad_norm": 0.013685944490134716, + "learning_rate": 5.078693961524329e-05, + "loss": 0.0109, + "step": 303 + }, + { + "epoch": 2.734006734006734, + "grad_norm": 0.016435936093330383, + "learning_rate": 5.0472176246160184e-05, + "loss": 0.0121, + "step": 304 + }, + { + "epoch": 2.7429854096520763, + "grad_norm": 0.019511640071868896, + "learning_rate": 5.01573941615805e-05, + "loss": 0.0111, + "step": 305 + }, + { + "epoch": 2.7429854096520763, + "eval_loss": 0.012916718609631062, + "eval_runtime": 6.27, + "eval_samples_per_second": 7.974, + "eval_steps_per_second": 2.073, + "step": 305 + }, + { + "epoch": 2.7519640852974185, + "grad_norm": 0.01732565462589264, + "learning_rate": 4.984260583841953e-05, + "loss": 0.0115, + "step": 306 + }, + { + "epoch": 2.760942760942761, + "grad_norm": 0.021635642275214195, + "learning_rate": 4.9527823753839834e-05, + "loss": 0.0135, + "step": 307 + }, + { + "epoch": 2.7699214365881035, + "grad_norm": 0.012034310959279537, + "learning_rate": 4.9213060384756716e-05, + "loss": 0.009, + "step": 308 + }, + { + "epoch": 2.778900112233446, + "grad_norm": 0.020500048995018005, + "learning_rate": 4.8898328207343666e-05, + "loss": 0.0123, + "step": 309 + }, + { + "epoch": 2.787878787878788, + "grad_norm": 0.01599585823714733, + "learning_rate": 4.858363969653781e-05, + "loss": 0.0129, + "step": 310 + }, + { + "epoch": 2.787878787878788, + "eval_loss": 0.012887900695204735, + "eval_runtime": 6.2513, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.08, + "step": 310 + }, + { + "epoch": 2.7968574635241303, + "grad_norm": 0.01829446479678154, + "learning_rate": 4.8269007325545506e-05, + "loss": 0.0127, + "step": 311 + }, + { + "epoch": 2.8058361391694726, + "grad_norm": 0.014843763783574104, + "learning_rate": 4.7954443565347865e-05, + "loss": 0.0104, + "step": 312 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 0.018556272611021996, + "learning_rate": 4.7639960884206576e-05, + "loss": 0.0132, + "step": 313 + }, + { + "epoch": 2.823793490460157, + "grad_norm": 0.016161100938916206, + "learning_rate": 4.7325571747169545e-05, + "loss": 0.0106, + "step": 314 + }, + { + "epoch": 2.8327721661054994, + "grad_norm": 0.01768597401678562, + "learning_rate": 4.7011288615576934e-05, + "loss": 0.0088, + "step": 315 + }, + { + "epoch": 2.8327721661054994, + "eval_loss": 0.012852279469370842, + "eval_runtime": 6.2725, + "eval_samples_per_second": 7.971, + "eval_steps_per_second": 2.073, + "step": 315 + }, + { + "epoch": 2.8417508417508417, + "grad_norm": 0.022071367129683495, + "learning_rate": 4.6697123946567227e-05, + "loss": 0.0159, + "step": 316 + }, + { + "epoch": 2.850729517396184, + "grad_norm": 0.012250754982233047, + "learning_rate": 4.63830901925834e-05, + "loss": 0.0098, + "step": 317 + }, + { + "epoch": 2.8597081930415262, + "grad_norm": 0.015096920542418957, + "learning_rate": 4.60691998008794e-05, + "loss": 0.011, + "step": 318 + }, + { + "epoch": 2.8686868686868685, + "grad_norm": 0.016319630667567253, + "learning_rate": 4.575546521302681e-05, + "loss": 0.0115, + "step": 319 + }, + { + "epoch": 2.877665544332211, + "grad_norm": 0.01583506353199482, + "learning_rate": 4.544189886442162e-05, + "loss": 0.0092, + "step": 320 + }, + { + "epoch": 2.877665544332211, + "eval_loss": 0.012964904308319092, + "eval_runtime": 6.2575, + "eval_samples_per_second": 7.99, + "eval_steps_per_second": 2.077, + "step": 320 + }, + { + "epoch": 2.886644219977553, + "grad_norm": 0.020128175616264343, + "learning_rate": 4.5128513183791386e-05, + "loss": 0.0141, + "step": 321 + }, + { + "epoch": 2.8956228956228958, + "grad_norm": 0.019213715568184853, + "learning_rate": 4.481532059270262e-05, + "loss": 0.0115, + "step": 322 + }, + { + "epoch": 2.904601571268238, + "grad_norm": 0.024485057219862938, + "learning_rate": 4.450233350506836e-05, + "loss": 0.0135, + "step": 323 + }, + { + "epoch": 2.9135802469135803, + "grad_norm": 0.019438456743955612, + "learning_rate": 4.418956432665618e-05, + "loss": 0.0108, + "step": 324 + }, + { + "epoch": 2.9225589225589226, + "grad_norm": 0.013679172843694687, + "learning_rate": 4.387702545459649e-05, + "loss": 0.0086, + "step": 325 + }, + { + "epoch": 2.9225589225589226, + "eval_loss": 0.012879169546067715, + "eval_runtime": 6.2664, + "eval_samples_per_second": 7.979, + "eval_steps_per_second": 2.075, + "step": 325 + }, + { + "epoch": 2.931537598204265, + "grad_norm": 0.018316788598895073, + "learning_rate": 4.356472927689109e-05, + "loss": 0.0112, + "step": 326 + }, + { + "epoch": 2.940516273849607, + "grad_norm": 0.01569702848792076, + "learning_rate": 4.32526881719222e-05, + "loss": 0.0129, + "step": 327 + }, + { + "epoch": 2.9494949494949494, + "grad_norm": 0.01372100692242384, + "learning_rate": 4.2940914507961775e-05, + "loss": 0.0104, + "step": 328 + }, + { + "epoch": 2.9584736251402917, + "grad_norm": 0.015811212360858917, + "learning_rate": 4.262942064268134e-05, + "loss": 0.0123, + "step": 329 + }, + { + "epoch": 2.9674523007856344, + "grad_norm": 0.018124472349882126, + "learning_rate": 4.23182189226621e-05, + "loss": 0.0132, + "step": 330 + }, + { + "epoch": 2.9674523007856344, + "eval_loss": 0.012610589154064655, + "eval_runtime": 6.2539, + "eval_samples_per_second": 7.995, + "eval_steps_per_second": 2.079, + "step": 330 + }, + { + "epoch": 2.9764309764309766, + "grad_norm": 0.015013212338089943, + "learning_rate": 4.20073216829056e-05, + "loss": 0.0124, + "step": 331 + }, + { + "epoch": 2.985409652076319, + "grad_norm": 0.013129116035997868, + "learning_rate": 4.169674124634481e-05, + "loss": 0.009, + "step": 332 + }, + { + "epoch": 2.994388327721661, + "grad_norm": 0.01896873489022255, + "learning_rate": 4.138648992335566e-05, + "loss": 0.0131, + "step": 333 + }, + { + "epoch": 3.005611672278339, + "grad_norm": 0.03602827712893486, + "learning_rate": 4.107658001126913e-05, + "loss": 0.0173, + "step": 334 + }, + { + "epoch": 3.014590347923681, + "grad_norm": 0.021112319082021713, + "learning_rate": 4.0767023793883785e-05, + "loss": 0.0126, + "step": 335 + }, + { + "epoch": 3.014590347923681, + "eval_loss": 0.012994157150387764, + "eval_runtime": 6.2538, + "eval_samples_per_second": 7.995, + "eval_steps_per_second": 2.079, + "step": 335 + }, + { + "epoch": 3.0235690235690234, + "grad_norm": 0.01987135224044323, + "learning_rate": 4.045783354097893e-05, + "loss": 0.0092, + "step": 336 + }, + { + "epoch": 3.032547699214366, + "grad_norm": 0.025037603452801704, + "learning_rate": 4.0149021507828224e-05, + "loss": 0.0131, + "step": 337 + }, + { + "epoch": 3.0415263748597083, + "grad_norm": 0.017175879329442978, + "learning_rate": 3.984059993471399e-05, + "loss": 0.0086, + "step": 338 + }, + { + "epoch": 3.0505050505050506, + "grad_norm": 0.018214913085103035, + "learning_rate": 3.9532581046442e-05, + "loss": 0.0104, + "step": 339 + }, + { + "epoch": 3.059483726150393, + "grad_norm": 0.016980677843093872, + "learning_rate": 3.9224977051856904e-05, + "loss": 0.0117, + "step": 340 + }, + { + "epoch": 3.059483726150393, + "eval_loss": 0.013331728056073189, + "eval_runtime": 6.2785, + "eval_samples_per_second": 7.964, + "eval_steps_per_second": 2.071, + "step": 340 + }, + { + "epoch": 3.068462401795735, + "grad_norm": 0.016574935987591743, + "learning_rate": 3.8917800143358404e-05, + "loss": 0.0077, + "step": 341 + }, + { + "epoch": 3.0774410774410774, + "grad_norm": 0.020380405709147453, + "learning_rate": 3.861106249641789e-05, + "loss": 0.0097, + "step": 342 + }, + { + "epoch": 3.0864197530864197, + "grad_norm": 0.026123059913516045, + "learning_rate": 3.830477626909589e-05, + "loss": 0.0125, + "step": 343 + }, + { + "epoch": 3.095398428731762, + "grad_norm": 0.02141587808728218, + "learning_rate": 3.7998953601560175e-05, + "loss": 0.0111, + "step": 344 + }, + { + "epoch": 3.1043771043771042, + "grad_norm": 0.021793678402900696, + "learning_rate": 3.769360661560453e-05, + "loss": 0.0102, + "step": 345 + }, + { + "epoch": 3.1043771043771042, + "eval_loss": 0.013248049654066563, + "eval_runtime": 6.2478, + "eval_samples_per_second": 8.003, + "eval_steps_per_second": 2.081, + "step": 345 + }, + { + "epoch": 3.1133557800224465, + "grad_norm": 0.013916864059865475, + "learning_rate": 3.73887474141683e-05, + "loss": 0.0088, + "step": 346 + }, + { + "epoch": 3.122334455667789, + "grad_norm": 0.015638204291462898, + "learning_rate": 3.708438808085668e-05, + "loss": 0.01, + "step": 347 + }, + { + "epoch": 3.1313131313131315, + "grad_norm": 0.017211005091667175, + "learning_rate": 3.6780540679461784e-05, + "loss": 0.0091, + "step": 348 + }, + { + "epoch": 3.1402918069584738, + "grad_norm": 0.02173846773803234, + "learning_rate": 3.64772172534844e-05, + "loss": 0.0118, + "step": 349 + }, + { + "epoch": 3.149270482603816, + "grad_norm": 0.012991265393793583, + "learning_rate": 3.6174429825656685e-05, + "loss": 0.0074, + "step": 350 + }, + { + "epoch": 3.149270482603816, + "eval_loss": 0.013174821622669697, + "eval_runtime": 6.296, + "eval_samples_per_second": 7.942, + "eval_steps_per_second": 2.065, + "step": 350 + }, + { + "epoch": 3.1582491582491583, + "grad_norm": 0.02047666721045971, + "learning_rate": 3.587219039746564e-05, + "loss": 0.0124, + "step": 351 + }, + { + "epoch": 3.1672278338945006, + "grad_norm": 0.0238481592386961, + "learning_rate": 3.557051094867735e-05, + "loss": 0.0122, + "step": 352 + }, + { + "epoch": 3.176206509539843, + "grad_norm": 0.01623239740729332, + "learning_rate": 3.5269403436862175e-05, + "loss": 0.0089, + "step": 353 + }, + { + "epoch": 3.185185185185185, + "grad_norm": 0.019198792055249214, + "learning_rate": 3.496887979692084e-05, + "loss": 0.0111, + "step": 354 + }, + { + "epoch": 3.1941638608305274, + "grad_norm": 0.019949574023485184, + "learning_rate": 3.466895194061128e-05, + "loss": 0.0105, + "step": 355 + }, + { + "epoch": 3.1941638608305274, + "eval_loss": 0.01294049434363842, + "eval_runtime": 6.2659, + "eval_samples_per_second": 7.98, + "eval_steps_per_second": 2.075, + "step": 355 + }, + { + "epoch": 3.2031425364758697, + "grad_norm": 0.01906234584748745, + "learning_rate": 3.436963175607656e-05, + "loss": 0.0096, + "step": 356 + }, + { + "epoch": 3.212121212121212, + "grad_norm": 0.019578171893954277, + "learning_rate": 3.4070931107373675e-05, + "loss": 0.0092, + "step": 357 + }, + { + "epoch": 3.221099887766554, + "grad_norm": 0.015484723262488842, + "learning_rate": 3.377286183400328e-05, + "loss": 0.011, + "step": 358 + }, + { + "epoch": 3.230078563411897, + "grad_norm": 0.017633194103837013, + "learning_rate": 3.3475435750440356e-05, + "loss": 0.0101, + "step": 359 + }, + { + "epoch": 3.239057239057239, + "grad_norm": 0.020478319376707077, + "learning_rate": 3.3178664645666066e-05, + "loss": 0.0117, + "step": 360 + }, + { + "epoch": 3.239057239057239, + "eval_loss": 0.012872601859271526, + "eval_runtime": 6.2587, + "eval_samples_per_second": 7.989, + "eval_steps_per_second": 2.077, + "step": 360 + }, + { + "epoch": 3.2480359147025815, + "grad_norm": 0.015297391451895237, + "learning_rate": 3.2882560282700336e-05, + "loss": 0.0096, + "step": 361 + }, + { + "epoch": 3.2570145903479237, + "grad_norm": 0.01740657351911068, + "learning_rate": 3.258713439813566e-05, + "loss": 0.0105, + "step": 362 + }, + { + "epoch": 3.265993265993266, + "grad_norm": 0.019270438700914383, + "learning_rate": 3.229239870167191e-05, + "loss": 0.0103, + "step": 363 + }, + { + "epoch": 3.2749719416386083, + "grad_norm": 0.017028817906975746, + "learning_rate": 3.199836487565222e-05, + "loss": 0.0109, + "step": 364 + }, + { + "epoch": 3.2839506172839505, + "grad_norm": 0.0167669877409935, + "learning_rate": 3.170504457459989e-05, + "loss": 0.0107, + "step": 365 + }, + { + "epoch": 3.2839506172839505, + "eval_loss": 0.012733125127851963, + "eval_runtime": 6.2533, + "eval_samples_per_second": 7.996, + "eval_steps_per_second": 2.079, + "step": 365 + }, + { + "epoch": 3.292929292929293, + "grad_norm": 0.01785757951438427, + "learning_rate": 3.1412449424756474e-05, + "loss": 0.0097, + "step": 366 + }, + { + "epoch": 3.301907968574635, + "grad_norm": 0.01732640527188778, + "learning_rate": 3.112059102362093e-05, + "loss": 0.0106, + "step": 367 + }, + { + "epoch": 3.3108866442199774, + "grad_norm": 0.015373657457530499, + "learning_rate": 3.082948093948997e-05, + "loss": 0.0094, + "step": 368 + }, + { + "epoch": 3.31986531986532, + "grad_norm": 0.01795523799955845, + "learning_rate": 3.053913071099947e-05, + "loss": 0.0096, + "step": 369 + }, + { + "epoch": 3.3288439955106623, + "grad_norm": 0.017400693148374557, + "learning_rate": 3.0249551846667207e-05, + "loss": 0.0098, + "step": 370 + }, + { + "epoch": 3.3288439955106623, + "eval_loss": 0.012761359103024006, + "eval_runtime": 6.2866, + "eval_samples_per_second": 7.953, + "eval_steps_per_second": 2.068, + "step": 370 + }, + { + "epoch": 3.3378226711560046, + "grad_norm": 0.02055426687002182, + "learning_rate": 2.996075582443658e-05, + "loss": 0.0115, + "step": 371 + }, + { + "epoch": 3.346801346801347, + "grad_norm": 0.014372746460139751, + "learning_rate": 2.9672754091221805e-05, + "loss": 0.0086, + "step": 372 + }, + { + "epoch": 3.355780022446689, + "grad_norm": 0.01651662401854992, + "learning_rate": 2.938555806245406e-05, + "loss": 0.0106, + "step": 373 + }, + { + "epoch": 3.3647586980920314, + "grad_norm": 0.016991253942251205, + "learning_rate": 2.9099179121629117e-05, + "loss": 0.0115, + "step": 374 + }, + { + "epoch": 3.3737373737373737, + "grad_norm": 0.016502108424901962, + "learning_rate": 2.881362861985606e-05, + "loss": 0.0092, + "step": 375 + }, + { + "epoch": 3.3737373737373737, + "eval_loss": 0.012749603018164635, + "eval_runtime": 6.2727, + "eval_samples_per_second": 7.971, + "eval_steps_per_second": 2.072, + "step": 375 + }, + { + "epoch": 3.382716049382716, + "grad_norm": 0.015984639525413513, + "learning_rate": 2.8528917875407433e-05, + "loss": 0.0094, + "step": 376 + }, + { + "epoch": 3.3916947250280582, + "grad_norm": 0.03028254769742489, + "learning_rate": 2.8245058173270622e-05, + "loss": 0.0102, + "step": 377 + }, + { + "epoch": 3.4006734006734005, + "grad_norm": 0.021088914945721626, + "learning_rate": 2.796206076470044e-05, + "loss": 0.0082, + "step": 378 + }, + { + "epoch": 3.409652076318743, + "grad_norm": 0.019139107316732407, + "learning_rate": 2.7679936866773315e-05, + "loss": 0.0095, + "step": 379 + }, + { + "epoch": 3.418630751964085, + "grad_norm": 0.018426954746246338, + "learning_rate": 2.739869766194263e-05, + "loss": 0.0114, + "step": 380 + }, + { + "epoch": 3.418630751964085, + "eval_loss": 0.012560264207422733, + "eval_runtime": 6.2597, + "eval_samples_per_second": 7.988, + "eval_steps_per_second": 2.077, + "step": 380 + }, + { + "epoch": 3.4276094276094278, + "grad_norm": 0.019736235961318016, + "learning_rate": 2.7118354297595396e-05, + "loss": 0.01, + "step": 381 + }, + { + "epoch": 3.43658810325477, + "grad_norm": 0.015529734082520008, + "learning_rate": 2.683891788561055e-05, + "loss": 0.0109, + "step": 382 + }, + { + "epoch": 3.4455667789001123, + "grad_norm": 0.01170337200164795, + "learning_rate": 2.6560399501918465e-05, + "loss": 0.0083, + "step": 383 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 0.016096895560622215, + "learning_rate": 2.6282810186061862e-05, + "loss": 0.0095, + "step": 384 + }, + { + "epoch": 3.463524130190797, + "grad_norm": 0.022902794182300568, + "learning_rate": 2.600616094075835e-05, + "loss": 0.0118, + "step": 385 + }, + { + "epoch": 3.463524130190797, + "eval_loss": 0.012463411316275597, + "eval_runtime": 6.256, + "eval_samples_per_second": 7.992, + "eval_steps_per_second": 2.078, + "step": 385 + }, + { + "epoch": 3.472502805836139, + "grad_norm": 0.01635258086025715, + "learning_rate": 2.5730462731464273e-05, + "loss": 0.0106, + "step": 386 + }, + { + "epoch": 3.4814814814814814, + "grad_norm": 0.019135868176817894, + "learning_rate": 2.5455726485940012e-05, + "loss": 0.0088, + "step": 387 + }, + { + "epoch": 3.4904601571268237, + "grad_norm": 0.015336276032030582, + "learning_rate": 2.5181963093816962e-05, + "loss": 0.0086, + "step": 388 + }, + { + "epoch": 3.499438832772166, + "grad_norm": 0.019149543717503548, + "learning_rate": 2.4909183406165836e-05, + "loss": 0.0095, + "step": 389 + }, + { + "epoch": 3.5084175084175087, + "grad_norm": 0.01899711787700653, + "learning_rate": 2.4637398235066527e-05, + "loss": 0.0108, + "step": 390 + }, + { + "epoch": 3.5084175084175087, + "eval_loss": 0.012311117723584175, + "eval_runtime": 6.2629, + "eval_samples_per_second": 7.983, + "eval_steps_per_second": 2.076, + "step": 390 + }, + { + "epoch": 3.517396184062851, + "grad_norm": 0.025319568812847137, + "learning_rate": 2.4366618353179644e-05, + "loss": 0.0128, + "step": 391 + }, + { + "epoch": 3.526374859708193, + "grad_norm": 0.014999724924564362, + "learning_rate": 2.4096854493319477e-05, + "loss": 0.0089, + "step": 392 + }, + { + "epoch": 3.5353535353535355, + "grad_norm": 0.0133373336866498, + "learning_rate": 2.3828117348028528e-05, + "loss": 0.0088, + "step": 393 + }, + { + "epoch": 3.5443322109988777, + "grad_norm": 0.016392188146710396, + "learning_rate": 2.3560417569153796e-05, + "loss": 0.0096, + "step": 394 + }, + { + "epoch": 3.55331088664422, + "grad_norm": 0.017216209322214127, + "learning_rate": 2.3293765767424537e-05, + "loss": 0.0092, + "step": 395 + }, + { + "epoch": 3.55331088664422, + "eval_loss": 0.012257438153028488, + "eval_runtime": 6.2604, + "eval_samples_per_second": 7.987, + "eval_steps_per_second": 2.077, + "step": 395 + }, + { + "epoch": 3.5622895622895623, + "grad_norm": 0.021824954077601433, + "learning_rate": 2.3028172512031604e-05, + "loss": 0.0101, + "step": 396 + }, + { + "epoch": 3.5712682379349046, + "grad_norm": 0.017453951761126518, + "learning_rate": 2.276364833020868e-05, + "loss": 0.0103, + "step": 397 + }, + { + "epoch": 3.580246913580247, + "grad_norm": 0.014409264549612999, + "learning_rate": 2.2500203706814856e-05, + "loss": 0.0089, + "step": 398 + }, + { + "epoch": 3.589225589225589, + "grad_norm": 0.015458152629435062, + "learning_rate": 2.2237849083919142e-05, + "loss": 0.0097, + "step": 399 + }, + { + "epoch": 3.5982042648709314, + "grad_norm": 0.01887071318924427, + "learning_rate": 2.1976594860386597e-05, + "loss": 0.0085, + "step": 400 + }, + { + "epoch": 3.5982042648709314, + "eval_loss": 0.01230713166296482, + "eval_runtime": 6.2496, + "eval_samples_per_second": 8.0, + "eval_steps_per_second": 2.08, + "step": 400 + }, + { + "epoch": 3.6071829405162736, + "grad_norm": 0.019031310454010963, + "learning_rate": 2.1716451391466008e-05, + "loss": 0.01, + "step": 401 + }, + { + "epoch": 3.616161616161616, + "grad_norm": 0.016506759449839592, + "learning_rate": 2.1457428988379635e-05, + "loss": 0.01, + "step": 402 + }, + { + "epoch": 3.6251402918069586, + "grad_norm": 0.016758006066083908, + "learning_rate": 2.1199537917914386e-05, + "loss": 0.01, + "step": 403 + }, + { + "epoch": 3.634118967452301, + "grad_norm": 0.02015763334929943, + "learning_rate": 2.0942788402014867e-05, + "loss": 0.0097, + "step": 404 + }, + { + "epoch": 3.643097643097643, + "grad_norm": 0.021632149815559387, + "learning_rate": 2.068719061737831e-05, + "loss": 0.0088, + "step": 405 + }, + { + "epoch": 3.643097643097643, + "eval_loss": 0.012639960274100304, + "eval_runtime": 6.255, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 2.078, + "step": 405 + }, + { + "epoch": 3.6520763187429854, + "grad_norm": 0.020619019865989685, + "learning_rate": 2.0432754695051136e-05, + "loss": 0.0112, + "step": 406 + }, + { + "epoch": 3.6610549943883277, + "grad_norm": 0.018945058807730675, + "learning_rate": 2.0179490720027372e-05, + "loss": 0.0104, + "step": 407 + }, + { + "epoch": 3.67003367003367, + "grad_norm": 0.016077589243650436, + "learning_rate": 1.992740873084899e-05, + "loss": 0.0084, + "step": 408 + }, + { + "epoch": 3.6790123456790123, + "grad_norm": 0.01626184582710266, + "learning_rate": 1.9676518719207977e-05, + "loss": 0.0093, + "step": 409 + }, + { + "epoch": 3.6879910213243545, + "grad_norm": 0.01499089039862156, + "learning_rate": 1.9426830629550242e-05, + "loss": 0.0095, + "step": 410 + }, + { + "epoch": 3.6879910213243545, + "eval_loss": 0.012408388778567314, + "eval_runtime": 6.2471, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.081, + "step": 410 + }, + { + "epoch": 3.6969696969696972, + "grad_norm": 0.01810368523001671, + "learning_rate": 1.917835435868155e-05, + "loss": 0.0101, + "step": 411 + }, + { + "epoch": 3.7059483726150395, + "grad_norm": 0.016352152451872826, + "learning_rate": 1.8931099755375203e-05, + "loss": 0.0085, + "step": 412 + }, + { + "epoch": 3.714927048260382, + "grad_norm": 0.01950286328792572, + "learning_rate": 1.8685076619981608e-05, + "loss": 0.0102, + "step": 413 + }, + { + "epoch": 3.723905723905724, + "grad_norm": 0.016881579533219337, + "learning_rate": 1.844029470403993e-05, + "loss": 0.0097, + "step": 414 + }, + { + "epoch": 3.7328843995510663, + "grad_norm": 0.020003410056233406, + "learning_rate": 1.8196763709891524e-05, + "loss": 0.0072, + "step": 415 + }, + { + "epoch": 3.7328843995510663, + "eval_loss": 0.012383312918245792, + "eval_runtime": 6.2483, + "eval_samples_per_second": 8.002, + "eval_steps_per_second": 2.081, + "step": 415 + }, + { + "epoch": 3.7418630751964086, + "grad_norm": 0.021129626780748367, + "learning_rate": 1.795449329029531e-05, + "loss": 0.0103, + "step": 416 + }, + { + "epoch": 3.750841750841751, + "grad_norm": 0.02026401087641716, + "learning_rate": 1.7713493048045294e-05, + "loss": 0.0108, + "step": 417 + }, + { + "epoch": 3.759820426487093, + "grad_norm": 0.019643913954496384, + "learning_rate": 1.747377253558982e-05, + "loss": 0.0102, + "step": 418 + }, + { + "epoch": 3.7687991021324354, + "grad_norm": 0.02257615514099598, + "learning_rate": 1.7235341254653005e-05, + "loss": 0.0103, + "step": 419 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.018804267048835754, + "learning_rate": 1.6998208655858137e-05, + "loss": 0.0105, + "step": 420 + }, + { + "epoch": 3.7777777777777777, + "eval_loss": 0.012317318469285965, + "eval_runtime": 6.2923, + "eval_samples_per_second": 7.946, + "eval_steps_per_second": 2.066, + "step": 420 + }, + { + "epoch": 3.78675645342312, + "grad_norm": 0.01885891705751419, + "learning_rate": 1.6762384138353078e-05, + "loss": 0.011, + "step": 421 + }, + { + "epoch": 3.795735129068462, + "grad_norm": 0.016811871901154518, + "learning_rate": 1.6527877049437622e-05, + "loss": 0.0082, + "step": 422 + }, + { + "epoch": 3.8047138047138045, + "grad_norm": 0.01912999525666237, + "learning_rate": 1.6294696684193154e-05, + "loss": 0.0095, + "step": 423 + }, + { + "epoch": 3.8136924803591468, + "grad_norm": 0.01893465779721737, + "learning_rate": 1.6062852285114123e-05, + "loss": 0.0091, + "step": 424 + }, + { + "epoch": 3.8226711560044895, + "grad_norm": 0.019678136333823204, + "learning_rate": 1.583235304174167e-05, + "loss": 0.0115, + "step": 425 + }, + { + "epoch": 3.8226711560044895, + "eval_loss": 0.01215137168765068, + "eval_runtime": 6.2539, + "eval_samples_per_second": 7.995, + "eval_steps_per_second": 2.079, + "step": 425 + }, + { + "epoch": 3.8316498316498318, + "grad_norm": 0.017283251509070396, + "learning_rate": 1.5603208090299498e-05, + "loss": 0.0082, + "step": 426 + }, + { + "epoch": 3.840628507295174, + "grad_norm": 0.018386101350188255, + "learning_rate": 1.537542651333167e-05, + "loss": 0.0098, + "step": 427 + }, + { + "epoch": 3.8496071829405163, + "grad_norm": 0.02193758450448513, + "learning_rate": 1.5149017339342574e-05, + "loss": 0.0105, + "step": 428 + }, + { + "epoch": 3.8585858585858586, + "grad_norm": 0.025852926075458527, + "learning_rate": 1.4923989542439159e-05, + "loss": 0.0108, + "step": 429 + }, + { + "epoch": 3.867564534231201, + "grad_norm": 0.015607142820954323, + "learning_rate": 1.4700352041975168e-05, + "loss": 0.007, + "step": 430 + }, + { + "epoch": 3.867564534231201, + "eval_loss": 0.012098519131541252, + "eval_runtime": 6.2731, + "eval_samples_per_second": 7.971, + "eval_steps_per_second": 2.072, + "step": 430 + }, + { + "epoch": 3.876543209876543, + "grad_norm": 0.018707241863012314, + "learning_rate": 1.447811370219757e-05, + "loss": 0.01, + "step": 431 + }, + { + "epoch": 3.8855218855218854, + "grad_norm": 0.017066778615117073, + "learning_rate": 1.4257283331895315e-05, + "loss": 0.0082, + "step": 432 + }, + { + "epoch": 3.894500561167228, + "grad_norm": 0.02140822634100914, + "learning_rate": 1.4037869684050115e-05, + "loss": 0.0117, + "step": 433 + }, + { + "epoch": 3.9034792368125704, + "grad_norm": 0.020129108801484108, + "learning_rate": 1.3819881455489458e-05, + "loss": 0.0085, + "step": 434 + }, + { + "epoch": 3.9124579124579126, + "grad_norm": 0.020222559571266174, + "learning_rate": 1.3603327286542023e-05, + "loss": 0.0112, + "step": 435 + }, + { + "epoch": 3.9124579124579126, + "eval_loss": 0.012134820222854614, + "eval_runtime": 6.2796, + "eval_samples_per_second": 7.962, + "eval_steps_per_second": 2.07, + "step": 435 + }, + { + "epoch": 3.921436588103255, + "grad_norm": 0.019684189930558205, + "learning_rate": 1.33882157606951e-05, + "loss": 0.0092, + "step": 436 + }, + { + "epoch": 3.930415263748597, + "grad_norm": 0.01632900908589363, + "learning_rate": 1.317455540425439e-05, + "loss": 0.0074, + "step": 437 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.021984471008181572, + "learning_rate": 1.2962354686006084e-05, + "loss": 0.0115, + "step": 438 + }, + { + "epoch": 3.9483726150392817, + "grad_norm": 0.014504092745482922, + "learning_rate": 1.2751622016881182e-05, + "loss": 0.0085, + "step": 439 + }, + { + "epoch": 3.957351290684624, + "grad_norm": 0.027196183800697327, + "learning_rate": 1.2542365749622049e-05, + "loss": 0.0103, + "step": 440 + }, + { + "epoch": 3.957351290684624, + "eval_loss": 0.012064680457115173, + "eval_runtime": 6.2674, + "eval_samples_per_second": 7.978, + "eval_steps_per_second": 2.074, + "step": 440 + }, + { + "epoch": 3.9663299663299663, + "grad_norm": 0.02008941024541855, + "learning_rate": 1.2334594178451425e-05, + "loss": 0.0088, + "step": 441 + }, + { + "epoch": 3.9753086419753085, + "grad_norm": 0.017460942268371582, + "learning_rate": 1.2128315538743646e-05, + "loss": 0.0074, + "step": 442 + }, + { + "epoch": 3.984287317620651, + "grad_norm": 0.01718473620712757, + "learning_rate": 1.1923538006698154e-05, + "loss": 0.0099, + "step": 443 + }, + { + "epoch": 3.993265993265993, + "grad_norm": 0.017572911456227303, + "learning_rate": 1.172026969901553e-05, + "loss": 0.0089, + "step": 444 + }, + { + "epoch": 4.004489337822672, + "grad_norm": 0.05128241330385208, + "learning_rate": 1.1518518672575701e-05, + "loss": 0.0162, + "step": 445 + }, + { + "epoch": 4.004489337822672, + "eval_loss": 0.0121712451800704, + "eval_runtime": 6.2683, + "eval_samples_per_second": 7.977, + "eval_steps_per_second": 2.074, + "step": 445 + }, + { + "epoch": 4.013468013468014, + "grad_norm": 0.017392810434103012, + "learning_rate": 1.1318292924118584e-05, + "loss": 0.0092, + "step": 446 + }, + { + "epoch": 4.022446689113356, + "grad_norm": 0.014832595363259315, + "learning_rate": 1.1119600389927182e-05, + "loss": 0.0089, + "step": 447 + }, + { + "epoch": 4.031425364758698, + "grad_norm": 0.014501234516501427, + "learning_rate": 1.092244894551298e-05, + "loss": 0.0076, + "step": 448 + }, + { + "epoch": 4.040404040404041, + "grad_norm": 0.01787727326154709, + "learning_rate": 1.0726846405303754e-05, + "loss": 0.0091, + "step": 449 + }, + { + "epoch": 4.049382716049383, + "grad_norm": 0.018708007410168648, + "learning_rate": 1.0532800522333902e-05, + "loss": 0.0079, + "step": 450 + }, + { + "epoch": 4.049382716049383, + "eval_loss": 0.0124581940472126, + "eval_runtime": 6.2521, + "eval_samples_per_second": 7.997, + "eval_steps_per_second": 2.079, + "step": 450 + }, + { + "epoch": 4.058361391694725, + "grad_norm": 0.017758728936314583, + "learning_rate": 1.0340318987937097e-05, + "loss": 0.0086, + "step": 451 + }, + { + "epoch": 4.0673400673400675, + "grad_norm": 0.02003531903028488, + "learning_rate": 1.014940943144142e-05, + "loss": 0.0099, + "step": 452 + }, + { + "epoch": 4.07631874298541, + "grad_norm": 0.020393820479512215, + "learning_rate": 9.960079419866985e-06, + "loss": 0.0067, + "step": 453 + }, + { + "epoch": 4.085297418630752, + "grad_norm": 0.015872273594141006, + "learning_rate": 9.772336457626014e-06, + "loss": 0.0065, + "step": 454 + }, + { + "epoch": 4.094276094276094, + "grad_norm": 0.027125949040055275, + "learning_rate": 9.586187986225325e-06, + "loss": 0.0102, + "step": 455 + }, + { + "epoch": 4.094276094276094, + "eval_loss": 0.012580028735101223, + "eval_runtime": 6.2573, + "eval_samples_per_second": 7.991, + "eval_steps_per_second": 2.078, + "step": 455 + }, + { + "epoch": 4.103254769921437, + "grad_norm": 0.023796912282705307, + "learning_rate": 9.401641383971477e-06, + "loss": 0.01, + "step": 456 + }, + { + "epoch": 4.112233445566779, + "grad_norm": 0.02073628082871437, + "learning_rate": 9.218703965678204e-06, + "loss": 0.0101, + "step": 457 + }, + { + "epoch": 4.121212121212121, + "grad_norm": 0.01744106411933899, + "learning_rate": 9.03738298237658e-06, + "loss": 0.0063, + "step": 458 + }, + { + "epoch": 4.130190796857463, + "grad_norm": 0.023079855367541313, + "learning_rate": 8.857685621027568e-06, + "loss": 0.0078, + "step": 459 + }, + { + "epoch": 4.139169472502806, + "grad_norm": 0.020268626511096954, + "learning_rate": 8.67961900423711e-06, + "loss": 0.0087, + "step": 460 + }, + { + "epoch": 4.139169472502806, + "eval_loss": 0.012577519752085209, + "eval_runtime": 6.2777, + "eval_samples_per_second": 7.965, + "eval_steps_per_second": 2.071, + "step": 460 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 0.020592128857970238, + "learning_rate": 8.503190189973914e-06, + "loss": 0.0099, + "step": 461 + }, + { + "epoch": 4.15712682379349, + "grad_norm": 0.016794390976428986, + "learning_rate": 8.328406171289621e-06, + "loss": 0.008, + "step": 462 + }, + { + "epoch": 4.1661054994388325, + "grad_norm": 0.016793973743915558, + "learning_rate": 8.155273876041614e-06, + "loss": 0.0076, + "step": 463 + }, + { + "epoch": 4.175084175084175, + "grad_norm": 0.01984918676316738, + "learning_rate": 7.983800166618482e-06, + "loss": 0.0087, + "step": 464 + }, + { + "epoch": 4.184062850729517, + "grad_norm": 0.034957610070705414, + "learning_rate": 7.813991839667995e-06, + "loss": 0.0107, + "step": 465 + }, + { + "epoch": 4.184062850729517, + "eval_loss": 0.012637750245630741, + "eval_runtime": 6.3143, + "eval_samples_per_second": 7.919, + "eval_steps_per_second": 2.059, + "step": 465 + }, + { + "epoch": 4.193041526374859, + "grad_norm": 0.0235477052628994, + "learning_rate": 7.645855625827658e-06, + "loss": 0.007, + "step": 466 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.021280469372868538, + "learning_rate": 7.4793981894580034e-06, + "loss": 0.0071, + "step": 467 + }, + { + "epoch": 4.210998877665545, + "grad_norm": 0.022229325026273727, + "learning_rate": 7.3146261283784104e-06, + "loss": 0.0086, + "step": 468 + }, + { + "epoch": 4.219977553310887, + "grad_norm": 0.020385632291436195, + "learning_rate": 7.1515459736055505e-06, + "loss": 0.0071, + "step": 469 + }, + { + "epoch": 4.228956228956229, + "grad_norm": 0.018814127892255783, + "learning_rate": 6.990164189094589e-06, + "loss": 0.0105, + "step": 470 + }, + { + "epoch": 4.228956228956229, + "eval_loss": 0.012454288080334663, + "eval_runtime": 6.259, + "eval_samples_per_second": 7.988, + "eval_steps_per_second": 2.077, + "step": 470 + }, + { + "epoch": 4.2379349046015715, + "grad_norm": 0.022755559533834457, + "learning_rate": 6.830487171482935e-06, + "loss": 0.0085, + "step": 471 + }, + { + "epoch": 4.246913580246914, + "grad_norm": 0.019691364839673042, + "learning_rate": 6.6725212498366885e-06, + "loss": 0.0087, + "step": 472 + }, + { + "epoch": 4.255892255892256, + "grad_norm": 0.021712522953748703, + "learning_rate": 6.516272685399793e-06, + "loss": 0.009, + "step": 473 + }, + { + "epoch": 4.264870931537598, + "grad_norm": 0.015157987363636494, + "learning_rate": 6.36174767134588e-06, + "loss": 0.0056, + "step": 474 + }, + { + "epoch": 4.273849607182941, + "grad_norm": 0.01883016712963581, + "learning_rate": 6.208952332532786e-06, + "loss": 0.0089, + "step": 475 + }, + { + "epoch": 4.273849607182941, + "eval_loss": 0.012438948266208172, + "eval_runtime": 6.2582, + "eval_samples_per_second": 7.99, + "eval_steps_per_second": 2.077, + "step": 475 + }, + { + "epoch": 4.282828282828283, + "grad_norm": 0.018075603991746902, + "learning_rate": 6.057892725259717e-06, + "loss": 0.0079, + "step": 476 + }, + { + "epoch": 4.291806958473625, + "grad_norm": 0.021707097068428993, + "learning_rate": 5.908574837027309e-06, + "loss": 0.0086, + "step": 477 + }, + { + "epoch": 4.300785634118967, + "grad_norm": 0.029295941814780235, + "learning_rate": 5.761004586300234e-06, + "loss": 0.0092, + "step": 478 + }, + { + "epoch": 4.30976430976431, + "grad_norm": 0.015479645691812038, + "learning_rate": 5.615187822272583e-06, + "loss": 0.0073, + "step": 479 + }, + { + "epoch": 4.318742985409652, + "grad_norm": 0.017396703362464905, + "learning_rate": 5.4711303246361144e-06, + "loss": 0.0061, + "step": 480 + }, + { + "epoch": 4.318742985409652, + "eval_loss": 0.012455189600586891, + "eval_runtime": 6.2775, + "eval_samples_per_second": 7.965, + "eval_steps_per_second": 2.071, + "step": 480 + }, + { + "epoch": 4.327721661054994, + "grad_norm": 0.019893964752554893, + "learning_rate": 5.328837803351083e-06, + "loss": 0.008, + "step": 481 + }, + { + "epoch": 4.3367003367003365, + "grad_norm": 0.01733791083097458, + "learning_rate": 5.188315898419971e-06, + "loss": 0.0085, + "step": 482 + }, + { + "epoch": 4.345679012345679, + "grad_norm": 0.03376675769686699, + "learning_rate": 5.04957017966391e-06, + "loss": 0.0079, + "step": 483 + }, + { + "epoch": 4.354657687991021, + "grad_norm": 0.01885765977203846, + "learning_rate": 4.912606146501886e-06, + "loss": 0.0103, + "step": 484 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.023781761527061462, + "learning_rate": 4.777429227732844e-06, + "loss": 0.0074, + "step": 485 + }, + { + "epoch": 4.363636363636363, + "eval_loss": 0.012579456903040409, + "eval_runtime": 6.2532, + "eval_samples_per_second": 7.996, + "eval_steps_per_second": 2.079, + "step": 485 + }, + { + "epoch": 4.372615039281706, + "grad_norm": 0.020697975531220436, + "learning_rate": 4.644044781320422e-06, + "loss": 0.0082, + "step": 486 + }, + { + "epoch": 4.381593714927048, + "grad_norm": 0.01883615553379059, + "learning_rate": 4.5124580941806165e-06, + "loss": 0.0066, + "step": 487 + }, + { + "epoch": 4.390572390572391, + "grad_norm": 0.0223472248762846, + "learning_rate": 4.382674381972224e-06, + "loss": 0.0094, + "step": 488 + }, + { + "epoch": 4.399551066217733, + "grad_norm": 0.02023017778992653, + "learning_rate": 4.254698788890127e-06, + "loss": 0.0084, + "step": 489 + }, + { + "epoch": 4.408529741863076, + "grad_norm": 0.01804838329553604, + "learning_rate": 4.12853638746134e-06, + "loss": 0.008, + "step": 490 + }, + { + "epoch": 4.408529741863076, + "eval_loss": 0.01255893800407648, + "eval_runtime": 6.2608, + "eval_samples_per_second": 7.986, + "eval_steps_per_second": 2.076, + "step": 490 + }, + { + "epoch": 4.417508417508418, + "grad_norm": 0.02165042981505394, + "learning_rate": 4.004192178344029e-06, + "loss": 0.0089, + "step": 491 + }, + { + "epoch": 4.42648709315376, + "grad_norm": 0.018235059455037117, + "learning_rate": 3.881671090129247e-06, + "loss": 0.0074, + "step": 492 + }, + { + "epoch": 4.435465768799102, + "grad_norm": 0.01595655083656311, + "learning_rate": 3.7609779791455744e-06, + "loss": 0.0077, + "step": 493 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.01976379007101059, + "learning_rate": 3.6421176292666783e-06, + "loss": 0.0094, + "step": 494 + }, + { + "epoch": 4.453423120089787, + "grad_norm": 0.02034621126949787, + "learning_rate": 3.5250947517216637e-06, + "loss": 0.0092, + "step": 495 + }, + { + "epoch": 4.453423120089787, + "eval_loss": 0.012510711327195168, + "eval_runtime": 6.2609, + "eval_samples_per_second": 7.986, + "eval_steps_per_second": 2.076, + "step": 495 + }, + { + "epoch": 4.462401795735129, + "grad_norm": 0.017096711322665215, + "learning_rate": 3.4099139849083307e-06, + "loss": 0.0068, + "step": 496 + }, + { + "epoch": 4.4713804713804715, + "grad_norm": 0.01568697765469551, + "learning_rate": 3.296579894209345e-06, + "loss": 0.006, + "step": 497 + }, + { + "epoch": 4.480359147025814, + "grad_norm": 0.03203177452087402, + "learning_rate": 3.1850969718112745e-06, + "loss": 0.0101, + "step": 498 + }, + { + "epoch": 4.489337822671156, + "grad_norm": 0.021872689947485924, + "learning_rate": 3.0754696365265068e-06, + "loss": 0.0071, + "step": 499 + }, + { + "epoch": 4.498316498316498, + "grad_norm": 0.02181348390877247, + "learning_rate": 2.9677022336181413e-06, + "loss": 0.0092, + "step": 500 + }, + { + "epoch": 4.498316498316498, + "eval_loss": 0.012468446046113968, + "eval_runtime": 6.2655, + "eval_samples_per_second": 7.98, + "eval_steps_per_second": 2.075, + "step": 500 + }, + { + "epoch": 4.5072951739618405, + "grad_norm": 0.01611742191016674, + "learning_rate": 2.8617990346277657e-06, + "loss": 0.0057, + "step": 501 + }, + { + "epoch": 4.516273849607183, + "grad_norm": 0.020343905314803123, + "learning_rate": 2.7577642372060673e-06, + "loss": 0.0086, + "step": 502 + }, + { + "epoch": 4.525252525252525, + "grad_norm": 0.021957550197839737, + "learning_rate": 2.6556019649465525e-06, + "loss": 0.0103, + "step": 503 + }, + { + "epoch": 4.534231200897867, + "grad_norm": 0.017281070351600647, + "learning_rate": 2.5553162672220465e-06, + "loss": 0.008, + "step": 504 + }, + { + "epoch": 4.54320987654321, + "grad_norm": 0.015391329303383827, + "learning_rate": 2.45691111902418e-06, + "loss": 0.0061, + "step": 505 + }, + { + "epoch": 4.54320987654321, + "eval_loss": 0.012444855645298958, + "eval_runtime": 6.2516, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.079, + "step": 505 + }, + { + "epoch": 4.552188552188552, + "grad_norm": 0.02002757415175438, + "learning_rate": 2.360390420805869e-06, + "loss": 0.0099, + "step": 506 + }, + { + "epoch": 4.561167227833894, + "grad_norm": 0.019125554710626602, + "learning_rate": 2.2657579983267064e-06, + "loss": 0.0059, + "step": 507 + }, + { + "epoch": 4.570145903479236, + "grad_norm": 0.017187196761369705, + "learning_rate": 2.1730176025012816e-06, + "loss": 0.0081, + "step": 508 + }, + { + "epoch": 4.57912457912458, + "grad_norm": 0.017918284982442856, + "learning_rate": 2.082172909250568e-06, + "loss": 0.0082, + "step": 509 + }, + { + "epoch": 4.588103254769921, + "grad_norm": 0.02295687235891819, + "learning_rate": 1.993227519356189e-06, + "loss": 0.0089, + "step": 510 + }, + { + "epoch": 4.588103254769921, + "eval_loss": 0.012445243075489998, + "eval_runtime": 6.2516, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.079, + "step": 510 + }, + { + "epoch": 4.597081930415264, + "grad_norm": 0.017204539850354195, + "learning_rate": 1.906184958317664e-06, + "loss": 0.0069, + "step": 511 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.019010348245501518, + "learning_rate": 1.8210486762127499e-06, + "loss": 0.0084, + "step": 512 + }, + { + "epoch": 4.615039281705949, + "grad_norm": 0.017709147185087204, + "learning_rate": 1.737822047560611e-06, + "loss": 0.0076, + "step": 513 + }, + { + "epoch": 4.624017957351291, + "grad_norm": 0.017709005624055862, + "learning_rate": 1.656508371188109e-06, + "loss": 0.0088, + "step": 514 + }, + { + "epoch": 4.632996632996633, + "grad_norm": 0.01932976208627224, + "learning_rate": 1.5771108700990412e-06, + "loss": 0.01, + "step": 515 + }, + { + "epoch": 4.632996632996633, + "eval_loss": 0.012414975091814995, + "eval_runtime": 6.2511, + "eval_samples_per_second": 7.999, + "eval_steps_per_second": 2.08, + "step": 515 + }, + { + "epoch": 4.6419753086419755, + "grad_norm": 0.015371643006801605, + "learning_rate": 1.4996326913463754e-06, + "loss": 0.0058, + "step": 516 + }, + { + "epoch": 4.650953984287318, + "grad_norm": 0.02080700546503067, + "learning_rate": 1.4240769059075342e-06, + "loss": 0.0089, + "step": 517 + }, + { + "epoch": 4.65993265993266, + "grad_norm": 0.019819900393486023, + "learning_rate": 1.3504465085626638e-06, + "loss": 0.0059, + "step": 518 + }, + { + "epoch": 4.668911335578002, + "grad_norm": 0.023678896948695183, + "learning_rate": 1.2787444177759068e-06, + "loss": 0.0075, + "step": 519 + }, + { + "epoch": 4.677890011223345, + "grad_norm": 0.019294695928692818, + "learning_rate": 1.208973475579761e-06, + "loss": 0.0081, + "step": 520 + }, + { + "epoch": 4.677890011223345, + "eval_loss": 0.012423361651599407, + "eval_runtime": 6.2709, + "eval_samples_per_second": 7.973, + "eval_steps_per_second": 2.073, + "step": 520 + }, + { + "epoch": 4.686868686868687, + "grad_norm": 0.019210556522011757, + "learning_rate": 1.1411364474624264e-06, + "loss": 0.007, + "step": 521 + }, + { + "epoch": 4.695847362514029, + "grad_norm": 0.020206844434142113, + "learning_rate": 1.075236022258147e-06, + "loss": 0.009, + "step": 522 + }, + { + "epoch": 4.704826038159371, + "grad_norm": 0.018495453521609306, + "learning_rate": 1.0112748120406856e-06, + "loss": 0.0092, + "step": 523 + }, + { + "epoch": 4.713804713804714, + "grad_norm": 0.015705464407801628, + "learning_rate": 9.492553520197733e-07, + "loss": 0.0072, + "step": 524 + }, + { + "epoch": 4.722783389450056, + "grad_norm": 0.020354999229311943, + "learning_rate": 8.891801004406119e-07, + "loss": 0.0072, + "step": 525 + }, + { + "epoch": 4.722783389450056, + "eval_loss": 0.01243152841925621, + "eval_runtime": 6.2577, + "eval_samples_per_second": 7.99, + "eval_steps_per_second": 2.077, + "step": 525 + }, + { + "epoch": 4.731762065095398, + "grad_norm": 0.022362370043992996, + "learning_rate": 8.31051438486441e-07, + "loss": 0.0089, + "step": 526 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 0.022488482296466827, + "learning_rate": 7.748716701841685e-07, + "loss": 0.0097, + "step": 527 + }, + { + "epoch": 4.749719416386083, + "grad_norm": 0.017749127000570297, + "learning_rate": 7.206430223130278e-07, + "loss": 0.0054, + "step": 528 + }, + { + "epoch": 4.758698092031425, + "grad_norm": 0.023475971072912216, + "learning_rate": 6.683676443163311e-07, + "loss": 0.0114, + "step": 529 + }, + { + "epoch": 4.767676767676767, + "grad_norm": 0.020731423050165176, + "learning_rate": 6.180476082162656e-07, + "loss": 0.0078, + "step": 530 + }, + { + "epoch": 4.767676767676767, + "eval_loss": 0.012432167306542397, + "eval_runtime": 6.2519, + "eval_samples_per_second": 7.998, + "eval_steps_per_second": 2.079, + "step": 530 + }, + { + "epoch": 4.77665544332211, + "grad_norm": 0.018397077918052673, + "learning_rate": 5.696849085317646e-07, + "loss": 0.0068, + "step": 531 + }, + { + "epoch": 4.785634118967453, + "grad_norm": 0.01895746774971485, + "learning_rate": 5.232814621994598e-07, + "loss": 0.0084, + "step": 532 + }, + { + "epoch": 4.794612794612795, + "grad_norm": 0.020780237391591072, + "learning_rate": 4.788391084976862e-07, + "loss": 0.0097, + "step": 533 + }, + { + "epoch": 4.803591470258137, + "grad_norm": 0.018094424158334732, + "learning_rate": 4.363596089735911e-07, + "loss": 0.0075, + "step": 534 + }, + { + "epoch": 4.8125701459034795, + "grad_norm": 0.02267824485898018, + "learning_rate": 3.958446473733002e-07, + "loss": 0.009, + "step": 535 + }, + { + "epoch": 4.8125701459034795, + "eval_loss": 0.012407775036990643, + "eval_runtime": 6.269, + "eval_samples_per_second": 7.976, + "eval_steps_per_second": 2.074, + "step": 535 + }, + { + "epoch": 4.821548821548822, + "grad_norm": 0.016193151473999023, + "learning_rate": 3.572958295752049e-07, + "loss": 0.0066, + "step": 536 + }, + { + "epoch": 4.830527497194164, + "grad_norm": 0.016786446794867516, + "learning_rate": 3.207146835262742e-07, + "loss": 0.0063, + "step": 537 + }, + { + "epoch": 4.839506172839506, + "grad_norm": 0.019592830911278725, + "learning_rate": 2.8610265918151414e-07, + "loss": 0.0093, + "step": 538 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.017766350880265236, + "learning_rate": 2.534611284465083e-07, + "loss": 0.0076, + "step": 539 + }, + { + "epoch": 4.857463524130191, + "grad_norm": 0.02071945182979107, + "learning_rate": 2.2279138512300567e-07, + "loss": 0.0106, + "step": 540 + }, + { + "epoch": 4.857463524130191, + "eval_loss": 0.01241106167435646, + "eval_runtime": 6.2616, + "eval_samples_per_second": 7.985, + "eval_steps_per_second": 2.076, + "step": 540 + }, + { + "epoch": 4.866442199775533, + "grad_norm": 0.017140503972768784, + "learning_rate": 1.940946448576675e-07, + "loss": 0.0063, + "step": 541 + }, + { + "epoch": 4.875420875420875, + "grad_norm": 0.017304055392742157, + "learning_rate": 1.6737204509387206e-07, + "loss": 0.0075, + "step": 542 + }, + { + "epoch": 4.884399551066218, + "grad_norm": 0.018793294206261635, + "learning_rate": 1.4262464502663443e-07, + "loss": 0.0099, + "step": 543 + }, + { + "epoch": 4.89337822671156, + "grad_norm": 0.018352854996919632, + "learning_rate": 1.1985342556060652e-07, + "loss": 0.0074, + "step": 544 + }, + { + "epoch": 4.902356902356902, + "grad_norm": 0.018267234787344933, + "learning_rate": 9.905928927123609e-08, + "loss": 0.0079, + "step": 545 + }, + { + "epoch": 4.902356902356902, + "eval_loss": 0.012396564707159996, + "eval_runtime": 6.2557, + "eval_samples_per_second": 7.993, + "eval_steps_per_second": 2.078, + "step": 545 + }, + { + "epoch": 4.9113355780022445, + "grad_norm": 0.02004612796008587, + "learning_rate": 8.02430603689397e-08, + "loss": 0.0079, + "step": 546 + }, + { + "epoch": 4.920314253647587, + "grad_norm": 0.017602894455194473, + "learning_rate": 6.340548466648443e-08, + "loss": 0.0084, + "step": 547 + }, + { + "epoch": 4.929292929292929, + "grad_norm": 0.017070379108190536, + "learning_rate": 4.8547229549383844e-08, + "loss": 0.0093, + "step": 548 + }, + { + "epoch": 4.938271604938271, + "grad_norm": 0.015886155888438225, + "learning_rate": 3.566888394948009e-08, + "loss": 0.0074, + "step": 549 + }, + { + "epoch": 4.947250280583614, + "grad_norm": 0.018648440018296242, + "learning_rate": 2.4770958321568283e-08, + "loss": 0.0082, + "step": 550 + }, + { + "epoch": 4.947250280583614, + "eval_loss": 0.01241134200245142, + "eval_runtime": 6.2543, + "eval_samples_per_second": 7.995, + "eval_steps_per_second": 2.079, + "step": 550 + }, + { + "epoch": 4.956228956228956, + "grad_norm": 0.01602097600698471, + "learning_rate": 1.5853884623195925e-08, + "loss": 0.0063, + "step": 551 + }, + { + "epoch": 4.965207631874298, + "grad_norm": 0.02373385988175869, + "learning_rate": 8.918016297515541e-09, + "loss": 0.0098, + "step": 552 + }, + { + "epoch": 4.974186307519641, + "grad_norm": 0.021371588110923767, + "learning_rate": 3.963628259290308e-09, + "loss": 0.0088, + "step": 553 + }, + { + "epoch": 4.983164983164983, + "grad_norm": 0.017986396327614784, + "learning_rate": 9.90916883986115e-10, + "loss": 0.0077, + "step": 554 + }, + { + "epoch": 4.992143658810326, + "grad_norm": 0.023325249552726746, + "learning_rate": 0.0, + "loss": 0.0082, + "step": 555 + }, + { + "epoch": 4.992143658810326, + "eval_loss": 0.012434104457497597, + "eval_runtime": 6.2582, + "eval_samples_per_second": 7.989, + "eval_steps_per_second": 2.077, + "step": 555 + }, + { + "epoch": 4.992143658810326, + "step": 555, + "total_flos": 9.281622886148997e+17, + "train_loss": 0.013788488566002868, + "train_runtime": 7381.5879, + "train_samples_per_second": 2.412, + "train_steps_per_second": 0.075 + } + ], + "logging_steps": 1, + "max_steps": 555, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.281622886148997e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}