{ "best_metric": 2.046032667160034, "best_model_checkpoint": "/home/sunggeunan/data/ICL/outputs/lora/SKIML-ICL_mrqa_nq_v3/Meta-Llama-3-8B-Instruct-unanswerable-5Q-0U-0C-qa_first/checkpoint-512", "epoch": 0.9982939312698026, "eval_steps": 500, "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019497928345113332, "grad_norm": 0.2671431005001068, "learning_rate": 6.493506493506494e-09, "loss": 2.0238, "step": 1 }, { "epoch": 0.0038995856690226664, "grad_norm": 0.26295146346092224, "learning_rate": 1.2987012987012988e-08, "loss": 2.0792, "step": 2 }, { "epoch": 0.005849378503534, "grad_norm": 0.26511502265930176, "learning_rate": 1.9480519480519478e-08, "loss": 2.1298, "step": 3 }, { "epoch": 0.007799171338045333, "grad_norm": 0.268216997385025, "learning_rate": 2.5974025974025976e-08, "loss": 2.0854, "step": 4 }, { "epoch": 0.009748964172556666, "grad_norm": 0.2698403000831604, "learning_rate": 3.246753246753246e-08, "loss": 2.0665, "step": 5 }, { "epoch": 0.011698757007068, "grad_norm": 0.2657904624938965, "learning_rate": 3.8961038961038956e-08, "loss": 2.0213, "step": 6 }, { "epoch": 0.013648549841579332, "grad_norm": 0.2607410252094269, "learning_rate": 4.545454545454545e-08, "loss": 2.0425, "step": 7 }, { "epoch": 0.015598342676090666, "grad_norm": 0.28946036100387573, "learning_rate": 5.194805194805195e-08, "loss": 2.0742, "step": 8 }, { "epoch": 0.017548135510601998, "grad_norm": 0.250527948141098, "learning_rate": 5.844155844155844e-08, "loss": 2.1037, "step": 9 }, { "epoch": 0.01949792834511333, "grad_norm": 0.29370346665382385, "learning_rate": 6.493506493506492e-08, "loss": 2.1355, "step": 10 }, { "epoch": 0.021447721179624665, "grad_norm": 0.2751532196998596, "learning_rate": 7.142857142857142e-08, "loss": 2.1219, "step": 11 }, { "epoch": 0.023397514014136, "grad_norm": 0.2966114282608032, "learning_rate": 7.792207792207791e-08, "loss": 2.1788, "step": 12 }, { "epoch": 0.02534730684864733, "grad_norm": 0.24350005388259888, "learning_rate": 8.441558441558441e-08, "loss": 2.0531, "step": 13 }, { "epoch": 0.027297099683158663, "grad_norm": 0.2536744177341461, "learning_rate": 9.09090909090909e-08, "loss": 2.0485, "step": 14 }, { "epoch": 0.029246892517669997, "grad_norm": 0.2583434581756592, "learning_rate": 9.74025974025974e-08, "loss": 2.0712, "step": 15 }, { "epoch": 0.03119668535218133, "grad_norm": 0.25572890043258667, "learning_rate": 1.038961038961039e-07, "loss": 2.0674, "step": 16 }, { "epoch": 0.03314647818669266, "grad_norm": 0.24798272550106049, "learning_rate": 1.1038961038961038e-07, "loss": 1.9777, "step": 17 }, { "epoch": 0.035096271021203995, "grad_norm": 0.25968796014785767, "learning_rate": 1.1688311688311688e-07, "loss": 2.1233, "step": 18 }, { "epoch": 0.03704606385571533, "grad_norm": 0.2510642111301422, "learning_rate": 1.2337662337662337e-07, "loss": 2.0819, "step": 19 }, { "epoch": 0.03899585669022666, "grad_norm": 0.2636696696281433, "learning_rate": 1.2987012987012984e-07, "loss": 2.1369, "step": 20 }, { "epoch": 0.040945649524738, "grad_norm": 0.26741182804107666, "learning_rate": 1.3636363636363635e-07, "loss": 2.0973, "step": 21 }, { "epoch": 0.04289544235924933, "grad_norm": 0.2516593933105469, "learning_rate": 1.4285714285714285e-07, "loss": 2.1089, "step": 22 }, { "epoch": 0.044845235193760664, "grad_norm": 0.2642120122909546, "learning_rate": 1.4935064935064935e-07, "loss": 2.069, "step": 23 }, { "epoch": 0.046795028028272, "grad_norm": 0.2595269978046417, "learning_rate": 1.5584415584415582e-07, "loss": 2.1304, "step": 24 }, { "epoch": 0.04874482086278333, "grad_norm": 0.2557779848575592, "learning_rate": 1.6233766233766232e-07, "loss": 2.0084, "step": 25 }, { "epoch": 0.05069461369729466, "grad_norm": 0.26405468583106995, "learning_rate": 1.6883116883116883e-07, "loss": 2.0683, "step": 26 }, { "epoch": 0.05264440653180599, "grad_norm": 0.2540312111377716, "learning_rate": 1.7532467532467533e-07, "loss": 2.1389, "step": 27 }, { "epoch": 0.05459419936631733, "grad_norm": 0.2732296586036682, "learning_rate": 1.818181818181818e-07, "loss": 2.0663, "step": 28 }, { "epoch": 0.05654399220082866, "grad_norm": 0.2802280783653259, "learning_rate": 1.883116883116883e-07, "loss": 2.0758, "step": 29 }, { "epoch": 0.058493785035339994, "grad_norm": 0.2741639018058777, "learning_rate": 1.948051948051948e-07, "loss": 2.0638, "step": 30 }, { "epoch": 0.06044357786985133, "grad_norm": 0.2648272216320038, "learning_rate": 2.012987012987013e-07, "loss": 2.0978, "step": 31 }, { "epoch": 0.06239337070436266, "grad_norm": 0.2700302004814148, "learning_rate": 2.077922077922078e-07, "loss": 2.1145, "step": 32 }, { "epoch": 0.064343163538874, "grad_norm": 0.24180686473846436, "learning_rate": 2.1428571428571426e-07, "loss": 2.0752, "step": 33 }, { "epoch": 0.06629295637338532, "grad_norm": 0.27451491355895996, "learning_rate": 2.2077922077922076e-07, "loss": 2.0719, "step": 34 }, { "epoch": 0.06824274920789666, "grad_norm": 0.2594657838344574, "learning_rate": 2.2727272727272726e-07, "loss": 2.0107, "step": 35 }, { "epoch": 0.07019254204240799, "grad_norm": 0.26720282435417175, "learning_rate": 2.3376623376623376e-07, "loss": 2.1045, "step": 36 }, { "epoch": 0.07214233487691933, "grad_norm": 0.2727048695087433, "learning_rate": 2.4025974025974024e-07, "loss": 2.0983, "step": 37 }, { "epoch": 0.07409212771143066, "grad_norm": 0.2821039855480194, "learning_rate": 2.4675324675324674e-07, "loss": 2.1199, "step": 38 }, { "epoch": 0.076041920545942, "grad_norm": 0.2540994882583618, "learning_rate": 2.532467532467532e-07, "loss": 2.0925, "step": 39 }, { "epoch": 0.07799171338045333, "grad_norm": 0.2766543924808502, "learning_rate": 2.597402597402597e-07, "loss": 2.1259, "step": 40 }, { "epoch": 0.07994150621496467, "grad_norm": 0.28683698177337646, "learning_rate": 2.662337662337662e-07, "loss": 2.135, "step": 41 }, { "epoch": 0.081891299049476, "grad_norm": 0.25892165303230286, "learning_rate": 2.727272727272727e-07, "loss": 2.0734, "step": 42 }, { "epoch": 0.08384109188398732, "grad_norm": 0.2723507881164551, "learning_rate": 2.792207792207792e-07, "loss": 2.0313, "step": 43 }, { "epoch": 0.08579088471849866, "grad_norm": 0.25262904167175293, "learning_rate": 2.857142857142857e-07, "loss": 2.0777, "step": 44 }, { "epoch": 0.08774067755300999, "grad_norm": 0.26076266169548035, "learning_rate": 2.922077922077922e-07, "loss": 2.0877, "step": 45 }, { "epoch": 0.08969047038752133, "grad_norm": 0.2711774408817291, "learning_rate": 2.987012987012987e-07, "loss": 2.063, "step": 46 }, { "epoch": 0.09164026322203266, "grad_norm": 0.24715273082256317, "learning_rate": 3.0519480519480515e-07, "loss": 2.0698, "step": 47 }, { "epoch": 0.093590056056544, "grad_norm": 0.2721501588821411, "learning_rate": 3.1168831168831165e-07, "loss": 2.0192, "step": 48 }, { "epoch": 0.09553984889105532, "grad_norm": 0.2476457953453064, "learning_rate": 3.1818181818181815e-07, "loss": 2.0208, "step": 49 }, { "epoch": 0.09748964172556666, "grad_norm": 0.26186031103134155, "learning_rate": 3.2467532467532465e-07, "loss": 2.1028, "step": 50 }, { "epoch": 0.09943943456007799, "grad_norm": 0.263841450214386, "learning_rate": 3.3116883116883115e-07, "loss": 2.071, "step": 51 }, { "epoch": 0.10138922739458932, "grad_norm": 0.27216637134552, "learning_rate": 3.3766233766233765e-07, "loss": 2.0743, "step": 52 }, { "epoch": 0.10333902022910066, "grad_norm": 0.25524261593818665, "learning_rate": 3.4415584415584415e-07, "loss": 2.0426, "step": 53 }, { "epoch": 0.10528881306361199, "grad_norm": 0.2809346914291382, "learning_rate": 3.5064935064935066e-07, "loss": 2.049, "step": 54 }, { "epoch": 0.10723860589812333, "grad_norm": 0.25672242045402527, "learning_rate": 3.5714285714285716e-07, "loss": 2.0213, "step": 55 }, { "epoch": 0.10918839873263465, "grad_norm": 0.2544190585613251, "learning_rate": 3.636363636363636e-07, "loss": 2.0663, "step": 56 }, { "epoch": 0.111138191567146, "grad_norm": 0.26028168201446533, "learning_rate": 3.701298701298701e-07, "loss": 2.0947, "step": 57 }, { "epoch": 0.11308798440165732, "grad_norm": 0.26112449169158936, "learning_rate": 3.766233766233766e-07, "loss": 2.0611, "step": 58 }, { "epoch": 0.11503777723616866, "grad_norm": 0.29020223021507263, "learning_rate": 3.831168831168831e-07, "loss": 2.1048, "step": 59 }, { "epoch": 0.11698757007067999, "grad_norm": 0.269167959690094, "learning_rate": 3.896103896103896e-07, "loss": 2.0392, "step": 60 }, { "epoch": 0.11893736290519133, "grad_norm": 0.2823875844478607, "learning_rate": 3.961038961038961e-07, "loss": 2.1341, "step": 61 }, { "epoch": 0.12088715573970266, "grad_norm": 0.27546533942222595, "learning_rate": 4.025974025974026e-07, "loss": 2.0903, "step": 62 }, { "epoch": 0.12283694857421398, "grad_norm": 0.2821657657623291, "learning_rate": 4.090909090909091e-07, "loss": 2.1028, "step": 63 }, { "epoch": 0.12478674140872532, "grad_norm": 0.2886088788509369, "learning_rate": 4.155844155844156e-07, "loss": 2.0685, "step": 64 }, { "epoch": 0.12673653424323666, "grad_norm": 0.3001558482646942, "learning_rate": 4.22077922077922e-07, "loss": 2.0996, "step": 65 }, { "epoch": 0.128686327077748, "grad_norm": 0.24933473765850067, "learning_rate": 4.285714285714285e-07, "loss": 2.0242, "step": 66 }, { "epoch": 0.13063611991225932, "grad_norm": 0.27868619561195374, "learning_rate": 4.35064935064935e-07, "loss": 2.0535, "step": 67 }, { "epoch": 0.13258591274677065, "grad_norm": 0.29242217540740967, "learning_rate": 4.415584415584415e-07, "loss": 2.0379, "step": 68 }, { "epoch": 0.134535705581282, "grad_norm": 0.2707277536392212, "learning_rate": 4.48051948051948e-07, "loss": 2.0922, "step": 69 }, { "epoch": 0.13648549841579333, "grad_norm": 0.2940627336502075, "learning_rate": 4.545454545454545e-07, "loss": 2.0857, "step": 70 }, { "epoch": 0.13843529125030465, "grad_norm": 0.25989463925361633, "learning_rate": 4.61038961038961e-07, "loss": 2.0664, "step": 71 }, { "epoch": 0.14038508408481598, "grad_norm": 0.2827669382095337, "learning_rate": 4.675324675324675e-07, "loss": 2.0804, "step": 72 }, { "epoch": 0.1423348769193273, "grad_norm": 0.2898445725440979, "learning_rate": 4.7402597402597397e-07, "loss": 2.1116, "step": 73 }, { "epoch": 0.14428466975383866, "grad_norm": 0.2953305244445801, "learning_rate": 4.805194805194805e-07, "loss": 2.0997, "step": 74 }, { "epoch": 0.14623446258835, "grad_norm": 0.28880831599235535, "learning_rate": 4.87012987012987e-07, "loss": 2.0695, "step": 75 }, { "epoch": 0.14818425542286132, "grad_norm": 0.2893301844596863, "learning_rate": 4.935064935064935e-07, "loss": 2.1663, "step": 76 }, { "epoch": 0.15013404825737264, "grad_norm": 0.27863314747810364, "learning_rate": 5e-07, "loss": 2.0468, "step": 77 }, { "epoch": 0.152083841091884, "grad_norm": 0.27849143743515015, "learning_rate": 4.996572995202193e-07, "loss": 2.0909, "step": 78 }, { "epoch": 0.15403363392639532, "grad_norm": 0.2688325345516205, "learning_rate": 4.993145990404387e-07, "loss": 2.1058, "step": 79 }, { "epoch": 0.15598342676090665, "grad_norm": 0.2714349627494812, "learning_rate": 4.989718985606579e-07, "loss": 2.0719, "step": 80 }, { "epoch": 0.15793321959541798, "grad_norm": 0.267674058675766, "learning_rate": 4.986291980808773e-07, "loss": 2.003, "step": 81 }, { "epoch": 0.15988301242992933, "grad_norm": 0.26871585845947266, "learning_rate": 4.982864976010966e-07, "loss": 2.0506, "step": 82 }, { "epoch": 0.16183280526444066, "grad_norm": 0.27725961804389954, "learning_rate": 4.97943797121316e-07, "loss": 2.0908, "step": 83 }, { "epoch": 0.163782598098952, "grad_norm": 0.26912689208984375, "learning_rate": 4.976010966415353e-07, "loss": 2.1065, "step": 84 }, { "epoch": 0.1657323909334633, "grad_norm": 0.26862508058547974, "learning_rate": 4.972583961617545e-07, "loss": 2.0017, "step": 85 }, { "epoch": 0.16768218376797464, "grad_norm": 0.2780780792236328, "learning_rate": 4.969156956819739e-07, "loss": 2.0812, "step": 86 }, { "epoch": 0.169631976602486, "grad_norm": 0.2691902816295624, "learning_rate": 4.965729952021932e-07, "loss": 2.108, "step": 87 }, { "epoch": 0.17158176943699732, "grad_norm": 0.25564315915107727, "learning_rate": 4.962302947224126e-07, "loss": 2.0141, "step": 88 }, { "epoch": 0.17353156227150865, "grad_norm": 0.29978710412979126, "learning_rate": 4.958875942426319e-07, "loss": 2.1087, "step": 89 }, { "epoch": 0.17548135510601998, "grad_norm": 0.26945438981056213, "learning_rate": 4.955448937628513e-07, "loss": 2.0654, "step": 90 }, { "epoch": 0.17743114794053133, "grad_norm": 0.2857602834701538, "learning_rate": 4.952021932830705e-07, "loss": 2.0258, "step": 91 }, { "epoch": 0.17938094077504266, "grad_norm": 0.3205603063106537, "learning_rate": 4.948594928032899e-07, "loss": 2.0839, "step": 92 }, { "epoch": 0.18133073360955398, "grad_norm": 0.29022127389907837, "learning_rate": 4.945167923235092e-07, "loss": 2.063, "step": 93 }, { "epoch": 0.1832805264440653, "grad_norm": 0.2677106559276581, "learning_rate": 4.941740918437286e-07, "loss": 2.0257, "step": 94 }, { "epoch": 0.18523031927857664, "grad_norm": 0.2686716318130493, "learning_rate": 4.938313913639479e-07, "loss": 2.053, "step": 95 }, { "epoch": 0.187180112113088, "grad_norm": 0.3096849322319031, "learning_rate": 4.934886908841673e-07, "loss": 2.0954, "step": 96 }, { "epoch": 0.18912990494759932, "grad_norm": 0.29678693413734436, "learning_rate": 4.931459904043865e-07, "loss": 2.0984, "step": 97 }, { "epoch": 0.19107969778211065, "grad_norm": 0.29280567169189453, "learning_rate": 4.928032899246059e-07, "loss": 2.1523, "step": 98 }, { "epoch": 0.19302949061662197, "grad_norm": 0.33339405059814453, "learning_rate": 4.924605894448252e-07, "loss": 2.1537, "step": 99 }, { "epoch": 0.19497928345113333, "grad_norm": 0.2959805727005005, "learning_rate": 4.921178889650445e-07, "loss": 2.07, "step": 100 }, { "epoch": 0.19692907628564466, "grad_norm": 0.2850833535194397, "learning_rate": 4.917751884852638e-07, "loss": 2.0565, "step": 101 }, { "epoch": 0.19887886912015598, "grad_norm": 0.27677983045578003, "learning_rate": 4.914324880054832e-07, "loss": 2.0252, "step": 102 }, { "epoch": 0.2008286619546673, "grad_norm": 0.2881922423839569, "learning_rate": 4.910897875257025e-07, "loss": 2.1085, "step": 103 }, { "epoch": 0.20277845478917864, "grad_norm": 0.28352612257003784, "learning_rate": 4.907470870459218e-07, "loss": 2.0758, "step": 104 }, { "epoch": 0.20472824762369, "grad_norm": 0.2815571427345276, "learning_rate": 4.904043865661412e-07, "loss": 2.0588, "step": 105 }, { "epoch": 0.20667804045820132, "grad_norm": 0.2817777395248413, "learning_rate": 4.900616860863605e-07, "loss": 2.0751, "step": 106 }, { "epoch": 0.20862783329271264, "grad_norm": 0.29829949140548706, "learning_rate": 4.897189856065798e-07, "loss": 2.0505, "step": 107 }, { "epoch": 0.21057762612722397, "grad_norm": 0.2886929214000702, "learning_rate": 4.893762851267992e-07, "loss": 2.028, "step": 108 }, { "epoch": 0.21252741896173533, "grad_norm": 0.28375059366226196, "learning_rate": 4.890335846470185e-07, "loss": 2.0282, "step": 109 }, { "epoch": 0.21447721179624665, "grad_norm": 0.27930572628974915, "learning_rate": 4.886908841672378e-07, "loss": 2.1027, "step": 110 }, { "epoch": 0.21642700463075798, "grad_norm": 0.27910512685775757, "learning_rate": 4.883481836874572e-07, "loss": 2.1146, "step": 111 }, { "epoch": 0.2183767974652693, "grad_norm": 0.286739319562912, "learning_rate": 4.880054832076765e-07, "loss": 2.0727, "step": 112 }, { "epoch": 0.22032659029978066, "grad_norm": 0.2716750502586365, "learning_rate": 4.876627827278957e-07, "loss": 2.02, "step": 113 }, { "epoch": 0.222276383134292, "grad_norm": 0.28050121665000916, "learning_rate": 4.873200822481151e-07, "loss": 1.9912, "step": 114 }, { "epoch": 0.22422617596880332, "grad_norm": 0.31914082169532776, "learning_rate": 4.869773817683344e-07, "loss": 2.0654, "step": 115 }, { "epoch": 0.22617596880331464, "grad_norm": 0.3212663233280182, "learning_rate": 4.866346812885538e-07, "loss": 2.1145, "step": 116 }, { "epoch": 0.22812576163782597, "grad_norm": 0.3040018081665039, "learning_rate": 4.862919808087731e-07, "loss": 2.1285, "step": 117 }, { "epoch": 0.23007555447233732, "grad_norm": 0.3013773560523987, "learning_rate": 4.859492803289925e-07, "loss": 2.0631, "step": 118 }, { "epoch": 0.23202534730684865, "grad_norm": 0.2854544520378113, "learning_rate": 4.856065798492117e-07, "loss": 2.0701, "step": 119 }, { "epoch": 0.23397514014135998, "grad_norm": 0.27997076511383057, "learning_rate": 4.852638793694311e-07, "loss": 1.9768, "step": 120 }, { "epoch": 0.2359249329758713, "grad_norm": 0.2790175974369049, "learning_rate": 4.849211788896504e-07, "loss": 2.0499, "step": 121 }, { "epoch": 0.23787472581038266, "grad_norm": 0.28126639127731323, "learning_rate": 4.845784784098698e-07, "loss": 2.0691, "step": 122 }, { "epoch": 0.23982451864489399, "grad_norm": 0.32007864117622375, "learning_rate": 4.842357779300891e-07, "loss": 2.0886, "step": 123 }, { "epoch": 0.2417743114794053, "grad_norm": 0.3017228841781616, "learning_rate": 4.838930774503084e-07, "loss": 2.0796, "step": 124 }, { "epoch": 0.24372410431391664, "grad_norm": 0.28364625573158264, "learning_rate": 4.835503769705277e-07, "loss": 2.0737, "step": 125 }, { "epoch": 0.24567389714842797, "grad_norm": 0.3120713233947754, "learning_rate": 4.83207676490747e-07, "loss": 2.0741, "step": 126 }, { "epoch": 0.24762368998293932, "grad_norm": 0.293863445520401, "learning_rate": 4.828649760109664e-07, "loss": 1.9777, "step": 127 }, { "epoch": 0.24957348281745065, "grad_norm": 0.2932412326335907, "learning_rate": 4.825222755311857e-07, "loss": 2.0567, "step": 128 }, { "epoch": 0.251523275651962, "grad_norm": 0.29689502716064453, "learning_rate": 4.821795750514051e-07, "loss": 2.0251, "step": 129 }, { "epoch": 0.25347306848647333, "grad_norm": 0.2953934669494629, "learning_rate": 4.818368745716243e-07, "loss": 2.0826, "step": 130 }, { "epoch": 0.25542286132098463, "grad_norm": 0.29008495807647705, "learning_rate": 4.814941740918437e-07, "loss": 1.9974, "step": 131 }, { "epoch": 0.257372654155496, "grad_norm": 0.29402440786361694, "learning_rate": 4.81151473612063e-07, "loss": 2.1115, "step": 132 }, { "epoch": 0.25932244699000734, "grad_norm": 0.313650906085968, "learning_rate": 4.808087731322824e-07, "loss": 2.0834, "step": 133 }, { "epoch": 0.26127223982451864, "grad_norm": 0.2968846261501312, "learning_rate": 4.804660726525017e-07, "loss": 2.0786, "step": 134 }, { "epoch": 0.26322203265903, "grad_norm": 0.30427923798561096, "learning_rate": 4.801233721727211e-07, "loss": 1.9974, "step": 135 }, { "epoch": 0.2651718254935413, "grad_norm": 0.3112437129020691, "learning_rate": 4.797806716929403e-07, "loss": 2.0837, "step": 136 }, { "epoch": 0.26712161832805265, "grad_norm": 0.30960723757743835, "learning_rate": 4.794379712131597e-07, "loss": 2.1307, "step": 137 }, { "epoch": 0.269071411162564, "grad_norm": 0.3101617097854614, "learning_rate": 4.79095270733379e-07, "loss": 2.0395, "step": 138 }, { "epoch": 0.2710212039970753, "grad_norm": 0.2995094358921051, "learning_rate": 4.787525702535984e-07, "loss": 2.0844, "step": 139 }, { "epoch": 0.27297099683158665, "grad_norm": 0.29981735348701477, "learning_rate": 4.784098697738176e-07, "loss": 2.0474, "step": 140 }, { "epoch": 0.27492078966609795, "grad_norm": 0.29965049028396606, "learning_rate": 4.78067169294037e-07, "loss": 2.0664, "step": 141 }, { "epoch": 0.2768705825006093, "grad_norm": 0.31631559133529663, "learning_rate": 4.777244688142563e-07, "loss": 2.0932, "step": 142 }, { "epoch": 0.27882037533512066, "grad_norm": 0.32392817735671997, "learning_rate": 4.773817683344756e-07, "loss": 2.0404, "step": 143 }, { "epoch": 0.28077016816963196, "grad_norm": 0.2919900715351105, "learning_rate": 4.77039067854695e-07, "loss": 2.0367, "step": 144 }, { "epoch": 0.2827199610041433, "grad_norm": 0.3037238121032715, "learning_rate": 4.7669636737491434e-07, "loss": 2.0741, "step": 145 }, { "epoch": 0.2846697538386546, "grad_norm": 0.2894318997859955, "learning_rate": 4.7635366689513363e-07, "loss": 2.0676, "step": 146 }, { "epoch": 0.28661954667316597, "grad_norm": 0.3007095158100128, "learning_rate": 4.760109664153529e-07, "loss": 2.051, "step": 147 }, { "epoch": 0.2885693395076773, "grad_norm": 0.31736671924591064, "learning_rate": 4.756682659355723e-07, "loss": 2.0587, "step": 148 }, { "epoch": 0.2905191323421886, "grad_norm": 0.3223492503166199, "learning_rate": 4.753255654557916e-07, "loss": 2.0884, "step": 149 }, { "epoch": 0.2924689251767, "grad_norm": 0.31644171476364136, "learning_rate": 4.749828649760109e-07, "loss": 2.128, "step": 150 }, { "epoch": 0.29441871801121133, "grad_norm": 0.3055993914604187, "learning_rate": 4.746401644962303e-07, "loss": 2.0597, "step": 151 }, { "epoch": 0.29636851084572263, "grad_norm": 0.3014571964740753, "learning_rate": 4.742974640164496e-07, "loss": 2.0674, "step": 152 }, { "epoch": 0.298318303680234, "grad_norm": 0.33088865876197815, "learning_rate": 4.739547635366689e-07, "loss": 2.0636, "step": 153 }, { "epoch": 0.3002680965147453, "grad_norm": 0.3139593005180359, "learning_rate": 4.736120630568883e-07, "loss": 2.0674, "step": 154 }, { "epoch": 0.30221788934925664, "grad_norm": 0.31804022192955017, "learning_rate": 4.732693625771076e-07, "loss": 2.1092, "step": 155 }, { "epoch": 0.304167682183768, "grad_norm": 0.34043845534324646, "learning_rate": 4.729266620973269e-07, "loss": 2.0391, "step": 156 }, { "epoch": 0.3061174750182793, "grad_norm": 0.34768176078796387, "learning_rate": 4.725839616175463e-07, "loss": 2.0984, "step": 157 }, { "epoch": 0.30806726785279065, "grad_norm": 0.30159029364585876, "learning_rate": 4.722412611377656e-07, "loss": 2.0085, "step": 158 }, { "epoch": 0.31001706068730195, "grad_norm": 0.3267905116081238, "learning_rate": 4.718985606579849e-07, "loss": 2.0719, "step": 159 }, { "epoch": 0.3119668535218133, "grad_norm": 0.3086291551589966, "learning_rate": 4.715558601782042e-07, "loss": 2.0928, "step": 160 }, { "epoch": 0.31391664635632466, "grad_norm": 0.30459094047546387, "learning_rate": 4.712131596984236e-07, "loss": 2.1044, "step": 161 }, { "epoch": 0.31586643919083596, "grad_norm": 0.2868260443210602, "learning_rate": 4.7087045921864287e-07, "loss": 2.0631, "step": 162 }, { "epoch": 0.3178162320253473, "grad_norm": 0.3526155650615692, "learning_rate": 4.7052775873886217e-07, "loss": 2.0573, "step": 163 }, { "epoch": 0.31976602485985867, "grad_norm": 0.3164813220500946, "learning_rate": 4.7018505825908157e-07, "loss": 2.1207, "step": 164 }, { "epoch": 0.32171581769436997, "grad_norm": 0.3223491907119751, "learning_rate": 4.6984235777930086e-07, "loss": 2.089, "step": 165 }, { "epoch": 0.3236656105288813, "grad_norm": 0.3313138484954834, "learning_rate": 4.6949965729952016e-07, "loss": 2.0777, "step": 166 }, { "epoch": 0.3256154033633926, "grad_norm": 0.3372494876384735, "learning_rate": 4.6915695681973956e-07, "loss": 2.0185, "step": 167 }, { "epoch": 0.327565196197904, "grad_norm": 0.3191705346107483, "learning_rate": 4.6881425633995885e-07, "loss": 2.0505, "step": 168 }, { "epoch": 0.32951498903241533, "grad_norm": 0.32238319516181946, "learning_rate": 4.6847155586017815e-07, "loss": 2.126, "step": 169 }, { "epoch": 0.3314647818669266, "grad_norm": 0.31298163533210754, "learning_rate": 4.6812885538039755e-07, "loss": 2.1064, "step": 170 }, { "epoch": 0.333414574701438, "grad_norm": 0.3096555471420288, "learning_rate": 4.6778615490061684e-07, "loss": 2.0649, "step": 171 }, { "epoch": 0.3353643675359493, "grad_norm": 0.3024272620677948, "learning_rate": 4.6744345442083614e-07, "loss": 2.0508, "step": 172 }, { "epoch": 0.33731416037046064, "grad_norm": 0.3325616419315338, "learning_rate": 4.671007539410555e-07, "loss": 2.1431, "step": 173 }, { "epoch": 0.339263953204972, "grad_norm": 0.3665126860141754, "learning_rate": 4.6675805346127483e-07, "loss": 2.1174, "step": 174 }, { "epoch": 0.3412137460394833, "grad_norm": 0.3292168378829956, "learning_rate": 4.664153529814941e-07, "loss": 2.1029, "step": 175 }, { "epoch": 0.34316353887399464, "grad_norm": 0.3286147713661194, "learning_rate": 4.6607265250171347e-07, "loss": 2.1042, "step": 176 }, { "epoch": 0.34511333170850594, "grad_norm": 0.32417264580726624, "learning_rate": 4.657299520219328e-07, "loss": 2.0901, "step": 177 }, { "epoch": 0.3470631245430173, "grad_norm": 0.31667739152908325, "learning_rate": 4.653872515421521e-07, "loss": 2.0895, "step": 178 }, { "epoch": 0.34901291737752865, "grad_norm": 0.3280418813228607, "learning_rate": 4.6504455106237146e-07, "loss": 2.1237, "step": 179 }, { "epoch": 0.35096271021203995, "grad_norm": 0.32828444242477417, "learning_rate": 4.647018505825908e-07, "loss": 2.0933, "step": 180 }, { "epoch": 0.3529125030465513, "grad_norm": 0.3365094065666199, "learning_rate": 4.643591501028101e-07, "loss": 2.1049, "step": 181 }, { "epoch": 0.35486229588106266, "grad_norm": 0.3169403076171875, "learning_rate": 4.6401644962302945e-07, "loss": 2.0636, "step": 182 }, { "epoch": 0.35681208871557396, "grad_norm": 0.31843212246894836, "learning_rate": 4.636737491432488e-07, "loss": 2.0744, "step": 183 }, { "epoch": 0.3587618815500853, "grad_norm": 0.34016114473342896, "learning_rate": 4.633310486634681e-07, "loss": 2.0572, "step": 184 }, { "epoch": 0.3607116743845966, "grad_norm": 0.3435775935649872, "learning_rate": 4.6298834818368744e-07, "loss": 2.0702, "step": 185 }, { "epoch": 0.36266146721910797, "grad_norm": 0.32756081223487854, "learning_rate": 4.6264564770390674e-07, "loss": 2.0219, "step": 186 }, { "epoch": 0.3646112600536193, "grad_norm": 0.3173263370990753, "learning_rate": 4.623029472241261e-07, "loss": 2.0134, "step": 187 }, { "epoch": 0.3665610528881306, "grad_norm": 0.33062443137168884, "learning_rate": 4.6196024674434543e-07, "loss": 2.0508, "step": 188 }, { "epoch": 0.368510845722642, "grad_norm": 0.3294820785522461, "learning_rate": 4.616175462645647e-07, "loss": 1.9935, "step": 189 }, { "epoch": 0.3704606385571533, "grad_norm": 0.3417966663837433, "learning_rate": 4.6127484578478407e-07, "loss": 2.0486, "step": 190 }, { "epoch": 0.37241043139166463, "grad_norm": 0.35238054394721985, "learning_rate": 4.609321453050034e-07, "loss": 2.0854, "step": 191 }, { "epoch": 0.374360224226176, "grad_norm": 0.3305458426475525, "learning_rate": 4.605894448252227e-07, "loss": 2.0449, "step": 192 }, { "epoch": 0.3763100170606873, "grad_norm": 0.324318528175354, "learning_rate": 4.6024674434544206e-07, "loss": 2.1153, "step": 193 }, { "epoch": 0.37825980989519864, "grad_norm": 0.3373543322086334, "learning_rate": 4.599040438656614e-07, "loss": 2.0677, "step": 194 }, { "epoch": 0.38020960272971, "grad_norm": 0.345115602016449, "learning_rate": 4.595613433858807e-07, "loss": 2.0312, "step": 195 }, { "epoch": 0.3821593955642213, "grad_norm": 0.3340489864349365, "learning_rate": 4.5921864290610005e-07, "loss": 1.9848, "step": 196 }, { "epoch": 0.38410918839873265, "grad_norm": 0.3615861237049103, "learning_rate": 4.588759424263194e-07, "loss": 2.0471, "step": 197 }, { "epoch": 0.38605898123324395, "grad_norm": 0.3380940854549408, "learning_rate": 4.585332419465387e-07, "loss": 2.0481, "step": 198 }, { "epoch": 0.3880087740677553, "grad_norm": 0.3478194773197174, "learning_rate": 4.58190541466758e-07, "loss": 2.0324, "step": 199 }, { "epoch": 0.38995856690226666, "grad_norm": 0.34738266468048096, "learning_rate": 4.578478409869774e-07, "loss": 2.0864, "step": 200 }, { "epoch": 0.39190835973677796, "grad_norm": 0.3694723844528198, "learning_rate": 4.575051405071967e-07, "loss": 2.1574, "step": 201 }, { "epoch": 0.3938581525712893, "grad_norm": 0.3413209617137909, "learning_rate": 4.57162440027416e-07, "loss": 2.067, "step": 202 }, { "epoch": 0.3958079454058006, "grad_norm": 0.3256085515022278, "learning_rate": 4.568197395476354e-07, "loss": 2.0749, "step": 203 }, { "epoch": 0.39775773824031196, "grad_norm": 0.3281763792037964, "learning_rate": 4.5647703906785467e-07, "loss": 2.0431, "step": 204 }, { "epoch": 0.3997075310748233, "grad_norm": 0.3446051776409149, "learning_rate": 4.5613433858807397e-07, "loss": 2.011, "step": 205 }, { "epoch": 0.4016573239093346, "grad_norm": 0.3425387442111969, "learning_rate": 4.5579163810829337e-07, "loss": 2.0987, "step": 206 }, { "epoch": 0.403607116743846, "grad_norm": 0.33923473954200745, "learning_rate": 4.5544893762851266e-07, "loss": 2.0777, "step": 207 }, { "epoch": 0.40555690957835727, "grad_norm": 0.34710973501205444, "learning_rate": 4.5510623714873196e-07, "loss": 2.0662, "step": 208 }, { "epoch": 0.4075067024128686, "grad_norm": 0.33852049708366394, "learning_rate": 4.5476353666895136e-07, "loss": 2.0872, "step": 209 }, { "epoch": 0.40945649524738, "grad_norm": 0.342153400182724, "learning_rate": 4.5442083618917065e-07, "loss": 2.0414, "step": 210 }, { "epoch": 0.4114062880818913, "grad_norm": 0.34867721796035767, "learning_rate": 4.5407813570938995e-07, "loss": 2.1128, "step": 211 }, { "epoch": 0.41335608091640264, "grad_norm": 0.33942094445228577, "learning_rate": 4.537354352296093e-07, "loss": 2.0786, "step": 212 }, { "epoch": 0.415305873750914, "grad_norm": 0.33538249135017395, "learning_rate": 4.5339273474982864e-07, "loss": 2.0332, "step": 213 }, { "epoch": 0.4172556665854253, "grad_norm": 0.34453144669532776, "learning_rate": 4.5305003427004794e-07, "loss": 2.0629, "step": 214 }, { "epoch": 0.41920545941993664, "grad_norm": 0.35166001319885254, "learning_rate": 4.527073337902673e-07, "loss": 2.0881, "step": 215 }, { "epoch": 0.42115525225444794, "grad_norm": 0.3170466721057892, "learning_rate": 4.5236463331048663e-07, "loss": 2.0508, "step": 216 }, { "epoch": 0.4231050450889593, "grad_norm": 0.3201327919960022, "learning_rate": 4.520219328307059e-07, "loss": 2.0147, "step": 217 }, { "epoch": 0.42505483792347065, "grad_norm": 0.34361732006073, "learning_rate": 4.5167923235092527e-07, "loss": 2.084, "step": 218 }, { "epoch": 0.42700463075798195, "grad_norm": 0.3500427305698395, "learning_rate": 4.513365318711446e-07, "loss": 2.0568, "step": 219 }, { "epoch": 0.4289544235924933, "grad_norm": 0.34151604771614075, "learning_rate": 4.509938313913639e-07, "loss": 2.0366, "step": 220 }, { "epoch": 0.4309042164270046, "grad_norm": 0.3297358751296997, "learning_rate": 4.5065113091158326e-07, "loss": 2.0639, "step": 221 }, { "epoch": 0.43285400926151596, "grad_norm": 0.3623073995113373, "learning_rate": 4.503084304318026e-07, "loss": 2.0477, "step": 222 }, { "epoch": 0.4348038020960273, "grad_norm": 0.34618520736694336, "learning_rate": 4.499657299520219e-07, "loss": 2.1036, "step": 223 }, { "epoch": 0.4367535949305386, "grad_norm": 0.3289443850517273, "learning_rate": 4.4962302947224125e-07, "loss": 2.0026, "step": 224 }, { "epoch": 0.43870338776504997, "grad_norm": 0.3390786349773407, "learning_rate": 4.4928032899246055e-07, "loss": 2.0208, "step": 225 }, { "epoch": 0.4406531805995613, "grad_norm": 0.3597511351108551, "learning_rate": 4.489376285126799e-07, "loss": 2.1259, "step": 226 }, { "epoch": 0.4426029734340726, "grad_norm": 0.3647196888923645, "learning_rate": 4.4859492803289924e-07, "loss": 2.1048, "step": 227 }, { "epoch": 0.444552766268584, "grad_norm": 0.35180747509002686, "learning_rate": 4.4825222755311854e-07, "loss": 2.0439, "step": 228 }, { "epoch": 0.4465025591030953, "grad_norm": 0.35504230856895447, "learning_rate": 4.479095270733379e-07, "loss": 2.0845, "step": 229 }, { "epoch": 0.44845235193760663, "grad_norm": 0.3500707447528839, "learning_rate": 4.4756682659355723e-07, "loss": 2.0717, "step": 230 }, { "epoch": 0.450402144772118, "grad_norm": 0.34788116812705994, "learning_rate": 4.472241261137765e-07, "loss": 2.1076, "step": 231 }, { "epoch": 0.4523519376066293, "grad_norm": 0.3553301990032196, "learning_rate": 4.4688142563399587e-07, "loss": 2.0512, "step": 232 }, { "epoch": 0.45430173044114064, "grad_norm": 0.3606579005718231, "learning_rate": 4.465387251542152e-07, "loss": 2.1154, "step": 233 }, { "epoch": 0.45625152327565194, "grad_norm": 0.3678739368915558, "learning_rate": 4.461960246744345e-07, "loss": 2.0755, "step": 234 }, { "epoch": 0.4582013161101633, "grad_norm": 0.3320152461528778, "learning_rate": 4.4585332419465386e-07, "loss": 2.0402, "step": 235 }, { "epoch": 0.46015110894467465, "grad_norm": 0.3439280688762665, "learning_rate": 4.455106237148732e-07, "loss": 2.0674, "step": 236 }, { "epoch": 0.46210090177918595, "grad_norm": 0.34789469838142395, "learning_rate": 4.451679232350925e-07, "loss": 2.0616, "step": 237 }, { "epoch": 0.4640506946136973, "grad_norm": 0.35700955986976624, "learning_rate": 4.448252227553118e-07, "loss": 2.0678, "step": 238 }, { "epoch": 0.4660004874482086, "grad_norm": 0.33981651067733765, "learning_rate": 4.444825222755312e-07, "loss": 2.0552, "step": 239 }, { "epoch": 0.46795028028271995, "grad_norm": 0.36125004291534424, "learning_rate": 4.441398217957505e-07, "loss": 2.0739, "step": 240 }, { "epoch": 0.4699000731172313, "grad_norm": 0.3675917088985443, "learning_rate": 4.437971213159698e-07, "loss": 2.0341, "step": 241 }, { "epoch": 0.4718498659517426, "grad_norm": 0.36773043870925903, "learning_rate": 4.434544208361892e-07, "loss": 2.1091, "step": 242 }, { "epoch": 0.47379965878625396, "grad_norm": 0.34321659803390503, "learning_rate": 4.431117203564085e-07, "loss": 2.0189, "step": 243 }, { "epoch": 0.4757494516207653, "grad_norm": 0.36672836542129517, "learning_rate": 4.427690198766278e-07, "loss": 2.064, "step": 244 }, { "epoch": 0.4776992444552766, "grad_norm": 0.3681386411190033, "learning_rate": 4.424263193968472e-07, "loss": 2.0895, "step": 245 }, { "epoch": 0.47964903728978797, "grad_norm": 0.36538165807724, "learning_rate": 4.4208361891706647e-07, "loss": 2.0361, "step": 246 }, { "epoch": 0.48159883012429927, "grad_norm": 0.3780750036239624, "learning_rate": 4.4174091843728577e-07, "loss": 2.053, "step": 247 }, { "epoch": 0.4835486229588106, "grad_norm": 0.3471691310405731, "learning_rate": 4.4139821795750517e-07, "loss": 2.0051, "step": 248 }, { "epoch": 0.485498415793322, "grad_norm": 0.36653193831443787, "learning_rate": 4.4105551747772446e-07, "loss": 2.1492, "step": 249 }, { "epoch": 0.4874482086278333, "grad_norm": 0.37775489687919617, "learning_rate": 4.4071281699794376e-07, "loss": 2.0406, "step": 250 }, { "epoch": 0.48939800146234463, "grad_norm": 0.3678765892982483, "learning_rate": 4.403701165181631e-07, "loss": 2.0804, "step": 251 }, { "epoch": 0.49134779429685593, "grad_norm": 0.3415094316005707, "learning_rate": 4.4002741603838245e-07, "loss": 2.0187, "step": 252 }, { "epoch": 0.4932975871313673, "grad_norm": 0.3463176190853119, "learning_rate": 4.3968471555860175e-07, "loss": 2.0618, "step": 253 }, { "epoch": 0.49524737996587864, "grad_norm": 0.3565087616443634, "learning_rate": 4.393420150788211e-07, "loss": 2.0809, "step": 254 }, { "epoch": 0.49719717280038994, "grad_norm": 0.3863977789878845, "learning_rate": 4.3899931459904044e-07, "loss": 2.038, "step": 255 }, { "epoch": 0.4991469656349013, "grad_norm": 0.3344396948814392, "learning_rate": 4.3865661411925974e-07, "loss": 2.071, "step": 256 }, { "epoch": 0.5010967584694126, "grad_norm": 0.3676479160785675, "learning_rate": 4.383139136394791e-07, "loss": 2.0469, "step": 257 }, { "epoch": 0.503046551303924, "grad_norm": 0.36381298303604126, "learning_rate": 4.3797121315969843e-07, "loss": 2.0795, "step": 258 }, { "epoch": 0.5049963441384353, "grad_norm": 0.3515491783618927, "learning_rate": 4.376285126799177e-07, "loss": 1.9912, "step": 259 }, { "epoch": 0.5069461369729467, "grad_norm": 0.3699260354042053, "learning_rate": 4.3728581220013707e-07, "loss": 2.0829, "step": 260 }, { "epoch": 0.5088959298074579, "grad_norm": 0.39030641317367554, "learning_rate": 4.3694311172035637e-07, "loss": 2.0917, "step": 261 }, { "epoch": 0.5108457226419693, "grad_norm": 0.35085543990135193, "learning_rate": 4.366004112405757e-07, "loss": 2.0517, "step": 262 }, { "epoch": 0.5127955154764806, "grad_norm": 0.3542785048484802, "learning_rate": 4.3625771076079506e-07, "loss": 2.0519, "step": 263 }, { "epoch": 0.514745308310992, "grad_norm": 0.36474236845970154, "learning_rate": 4.3591501028101436e-07, "loss": 1.9739, "step": 264 }, { "epoch": 0.5166951011455033, "grad_norm": 0.37260621786117554, "learning_rate": 4.355723098012337e-07, "loss": 1.9897, "step": 265 }, { "epoch": 0.5186448939800147, "grad_norm": 0.3556238114833832, "learning_rate": 4.3522960932145305e-07, "loss": 2.0196, "step": 266 }, { "epoch": 0.5205946868145259, "grad_norm": 0.36310216784477234, "learning_rate": 4.3488690884167235e-07, "loss": 2.0151, "step": 267 }, { "epoch": 0.5225444796490373, "grad_norm": 0.37483158707618713, "learning_rate": 4.345442083618917e-07, "loss": 2.0929, "step": 268 }, { "epoch": 0.5244942724835486, "grad_norm": 0.3717723786830902, "learning_rate": 4.3420150788211104e-07, "loss": 2.1377, "step": 269 }, { "epoch": 0.52644406531806, "grad_norm": 0.34406736493110657, "learning_rate": 4.3385880740233034e-07, "loss": 2.0109, "step": 270 }, { "epoch": 0.5283938581525713, "grad_norm": 0.37034499645233154, "learning_rate": 4.335161069225497e-07, "loss": 2.0867, "step": 271 }, { "epoch": 0.5303436509870826, "grad_norm": 0.3672201931476593, "learning_rate": 4.3317340644276903e-07, "loss": 2.0828, "step": 272 }, { "epoch": 0.5322934438215939, "grad_norm": 0.3954712152481079, "learning_rate": 4.328307059629883e-07, "loss": 2.0625, "step": 273 }, { "epoch": 0.5342432366561053, "grad_norm": 0.35529398918151855, "learning_rate": 4.324880054832076e-07, "loss": 2.1149, "step": 274 }, { "epoch": 0.5361930294906166, "grad_norm": 0.34687867760658264, "learning_rate": 4.32145305003427e-07, "loss": 2.0161, "step": 275 }, { "epoch": 0.538142822325128, "grad_norm": 0.3687521815299988, "learning_rate": 4.318026045236463e-07, "loss": 2.0979, "step": 276 }, { "epoch": 0.5400926151596392, "grad_norm": 0.36186617612838745, "learning_rate": 4.314599040438656e-07, "loss": 2.0323, "step": 277 }, { "epoch": 0.5420424079941506, "grad_norm": 0.34530189633369446, "learning_rate": 4.31117203564085e-07, "loss": 2.0252, "step": 278 }, { "epoch": 0.543992200828662, "grad_norm": 0.36403632164001465, "learning_rate": 4.307745030843043e-07, "loss": 2.0518, "step": 279 }, { "epoch": 0.5459419936631733, "grad_norm": 0.4035261869430542, "learning_rate": 4.304318026045236e-07, "loss": 2.1648, "step": 280 }, { "epoch": 0.5478917864976847, "grad_norm": 0.36672019958496094, "learning_rate": 4.30089102124743e-07, "loss": 2.0564, "step": 281 }, { "epoch": 0.5498415793321959, "grad_norm": 0.386877179145813, "learning_rate": 4.297464016449623e-07, "loss": 2.0859, "step": 282 }, { "epoch": 0.5517913721667073, "grad_norm": 0.38155534863471985, "learning_rate": 4.294037011651816e-07, "loss": 2.0828, "step": 283 }, { "epoch": 0.5537411650012186, "grad_norm": 0.3724847435951233, "learning_rate": 4.29061000685401e-07, "loss": 2.167, "step": 284 }, { "epoch": 0.55569095783573, "grad_norm": 0.3812715411186218, "learning_rate": 4.287183002056203e-07, "loss": 2.0624, "step": 285 }, { "epoch": 0.5576407506702413, "grad_norm": 0.365509569644928, "learning_rate": 4.283755997258396e-07, "loss": 2.0324, "step": 286 }, { "epoch": 0.5595905435047526, "grad_norm": 0.3624550700187683, "learning_rate": 4.2803289924605887e-07, "loss": 2.0274, "step": 287 }, { "epoch": 0.5615403363392639, "grad_norm": 0.38429534435272217, "learning_rate": 4.2769019876627827e-07, "loss": 2.0031, "step": 288 }, { "epoch": 0.5634901291737753, "grad_norm": 0.3589562773704529, "learning_rate": 4.2734749828649757e-07, "loss": 2.0547, "step": 289 }, { "epoch": 0.5654399220082866, "grad_norm": 0.3625582158565521, "learning_rate": 4.2700479780671686e-07, "loss": 2.0044, "step": 290 }, { "epoch": 0.567389714842798, "grad_norm": 0.37126410007476807, "learning_rate": 4.2666209732693626e-07, "loss": 2.0788, "step": 291 }, { "epoch": 0.5693395076773092, "grad_norm": 0.36942729353904724, "learning_rate": 4.2631939684715556e-07, "loss": 2.0676, "step": 292 }, { "epoch": 0.5712893005118206, "grad_norm": 0.3787277936935425, "learning_rate": 4.2597669636737485e-07, "loss": 2.0491, "step": 293 }, { "epoch": 0.5732390933463319, "grad_norm": 0.3843463957309723, "learning_rate": 4.2563399588759425e-07, "loss": 2.0657, "step": 294 }, { "epoch": 0.5751888861808433, "grad_norm": 0.384347140789032, "learning_rate": 4.2529129540781355e-07, "loss": 2.042, "step": 295 }, { "epoch": 0.5771386790153546, "grad_norm": 0.38822734355926514, "learning_rate": 4.2494859492803284e-07, "loss": 2.1084, "step": 296 }, { "epoch": 0.579088471849866, "grad_norm": 0.3850004971027374, "learning_rate": 4.2460589444825224e-07, "loss": 2.0527, "step": 297 }, { "epoch": 0.5810382646843772, "grad_norm": 0.3730074167251587, "learning_rate": 4.2426319396847154e-07, "loss": 2.0665, "step": 298 }, { "epoch": 0.5829880575188886, "grad_norm": 0.3895587623119354, "learning_rate": 4.2392049348869083e-07, "loss": 2.1166, "step": 299 }, { "epoch": 0.5849378503534, "grad_norm": 0.3875929117202759, "learning_rate": 4.235777930089102e-07, "loss": 2.1165, "step": 300 }, { "epoch": 0.5868876431879113, "grad_norm": 0.36664247512817383, "learning_rate": 4.232350925291295e-07, "loss": 2.0039, "step": 301 }, { "epoch": 0.5888374360224227, "grad_norm": 0.3771498501300812, "learning_rate": 4.228923920493488e-07, "loss": 2.0727, "step": 302 }, { "epoch": 0.5907872288569339, "grad_norm": 0.3995096981525421, "learning_rate": 4.2254969156956817e-07, "loss": 2.0836, "step": 303 }, { "epoch": 0.5927370216914453, "grad_norm": 0.3781261444091797, "learning_rate": 4.222069910897875e-07, "loss": 2.0797, "step": 304 }, { "epoch": 0.5946868145259566, "grad_norm": 0.37572017312049866, "learning_rate": 4.218642906100068e-07, "loss": 2.0363, "step": 305 }, { "epoch": 0.596636607360468, "grad_norm": 0.38773536682128906, "learning_rate": 4.2152159013022616e-07, "loss": 2.0423, "step": 306 }, { "epoch": 0.5985864001949793, "grad_norm": 0.37952083349227905, "learning_rate": 4.211788896504455e-07, "loss": 2.0966, "step": 307 }, { "epoch": 0.6005361930294906, "grad_norm": 0.39403635263442993, "learning_rate": 4.208361891706648e-07, "loss": 2.1212, "step": 308 }, { "epoch": 0.6024859858640019, "grad_norm": 0.382625013589859, "learning_rate": 4.2049348869088415e-07, "loss": 2.0363, "step": 309 }, { "epoch": 0.6044357786985133, "grad_norm": 0.3843761682510376, "learning_rate": 4.201507882111035e-07, "loss": 1.9995, "step": 310 }, { "epoch": 0.6063855715330246, "grad_norm": 0.4082648754119873, "learning_rate": 4.198080877313228e-07, "loss": 2.1265, "step": 311 }, { "epoch": 0.608335364367536, "grad_norm": 0.3746339678764343, "learning_rate": 4.1946538725154214e-07, "loss": 2.0025, "step": 312 }, { "epoch": 0.6102851572020472, "grad_norm": 0.38548338413238525, "learning_rate": 4.1912268677176143e-07, "loss": 2.0764, "step": 313 }, { "epoch": 0.6122349500365586, "grad_norm": 0.3702864944934845, "learning_rate": 4.187799862919808e-07, "loss": 2.0788, "step": 314 }, { "epoch": 0.6141847428710699, "grad_norm": 0.3946288824081421, "learning_rate": 4.184372858122001e-07, "loss": 2.0877, "step": 315 }, { "epoch": 0.6161345357055813, "grad_norm": 0.3777286410331726, "learning_rate": 4.180945853324194e-07, "loss": 1.9863, "step": 316 }, { "epoch": 0.6180843285400927, "grad_norm": 0.40816164016723633, "learning_rate": 4.1775188485263877e-07, "loss": 2.0987, "step": 317 }, { "epoch": 0.6200341213746039, "grad_norm": 0.39065074920654297, "learning_rate": 4.174091843728581e-07, "loss": 2.0629, "step": 318 }, { "epoch": 0.6219839142091153, "grad_norm": 0.38007447123527527, "learning_rate": 4.170664838930774e-07, "loss": 2.0544, "step": 319 }, { "epoch": 0.6239337070436266, "grad_norm": 0.3953652083873749, "learning_rate": 4.1672378341329676e-07, "loss": 2.07, "step": 320 }, { "epoch": 0.625883499878138, "grad_norm": 0.38142332434654236, "learning_rate": 4.163810829335161e-07, "loss": 2.0495, "step": 321 }, { "epoch": 0.6278332927126493, "grad_norm": 0.40484854578971863, "learning_rate": 4.160383824537354e-07, "loss": 2.0341, "step": 322 }, { "epoch": 0.6297830855471606, "grad_norm": 0.4031660556793213, "learning_rate": 4.1569568197395475e-07, "loss": 2.0168, "step": 323 }, { "epoch": 0.6317328783816719, "grad_norm": 0.3859906792640686, "learning_rate": 4.153529814941741e-07, "loss": 2.051, "step": 324 }, { "epoch": 0.6336826712161833, "grad_norm": 0.37458735704421997, "learning_rate": 4.150102810143934e-07, "loss": 2.038, "step": 325 }, { "epoch": 0.6356324640506946, "grad_norm": 0.39573705196380615, "learning_rate": 4.146675805346127e-07, "loss": 2.0308, "step": 326 }, { "epoch": 0.637582256885206, "grad_norm": 0.39273601770401, "learning_rate": 4.143248800548321e-07, "loss": 2.0746, "step": 327 }, { "epoch": 0.6395320497197173, "grad_norm": 0.39438948035240173, "learning_rate": 4.139821795750514e-07, "loss": 2.0568, "step": 328 }, { "epoch": 0.6414818425542286, "grad_norm": 0.3938084840774536, "learning_rate": 4.1363947909527067e-07, "loss": 2.0643, "step": 329 }, { "epoch": 0.6434316353887399, "grad_norm": 0.4020846486091614, "learning_rate": 4.1329677861549007e-07, "loss": 2.0737, "step": 330 }, { "epoch": 0.6453814282232513, "grad_norm": 0.413841187953949, "learning_rate": 4.1295407813570937e-07, "loss": 2.019, "step": 331 }, { "epoch": 0.6473312210577626, "grad_norm": 0.39189133048057556, "learning_rate": 4.1261137765592866e-07, "loss": 2.0795, "step": 332 }, { "epoch": 0.649281013892274, "grad_norm": 0.4119293987751007, "learning_rate": 4.1226867717614806e-07, "loss": 2.0794, "step": 333 }, { "epoch": 0.6512308067267852, "grad_norm": 0.40321430563926697, "learning_rate": 4.1192597669636736e-07, "loss": 2.0249, "step": 334 }, { "epoch": 0.6531805995612966, "grad_norm": 0.39300522208213806, "learning_rate": 4.1158327621658665e-07, "loss": 2.0667, "step": 335 }, { "epoch": 0.655130392395808, "grad_norm": 0.39836639165878296, "learning_rate": 4.1124057573680605e-07, "loss": 2.037, "step": 336 }, { "epoch": 0.6570801852303193, "grad_norm": 0.41594526171684265, "learning_rate": 4.1089787525702535e-07, "loss": 2.0795, "step": 337 }, { "epoch": 0.6590299780648307, "grad_norm": 0.3934768736362457, "learning_rate": 4.1055517477724464e-07, "loss": 2.0045, "step": 338 }, { "epoch": 0.6609797708993419, "grad_norm": 0.3954453766345978, "learning_rate": 4.10212474297464e-07, "loss": 2.0005, "step": 339 }, { "epoch": 0.6629295637338533, "grad_norm": 0.42002055048942566, "learning_rate": 4.0986977381768334e-07, "loss": 2.0781, "step": 340 }, { "epoch": 0.6648793565683646, "grad_norm": 0.3964640200138092, "learning_rate": 4.0952707333790263e-07, "loss": 2.1088, "step": 341 }, { "epoch": 0.666829149402876, "grad_norm": 0.3742097318172455, "learning_rate": 4.09184372858122e-07, "loss": 2.0201, "step": 342 }, { "epoch": 0.6687789422373873, "grad_norm": 0.40264692902565, "learning_rate": 4.088416723783413e-07, "loss": 2.0927, "step": 343 }, { "epoch": 0.6707287350718986, "grad_norm": 0.39995405077934265, "learning_rate": 4.084989718985606e-07, "loss": 2.0783, "step": 344 }, { "epoch": 0.6726785279064099, "grad_norm": 0.39974457025527954, "learning_rate": 4.0815627141877997e-07, "loss": 2.0613, "step": 345 }, { "epoch": 0.6746283207409213, "grad_norm": 0.39440110325813293, "learning_rate": 4.078135709389993e-07, "loss": 2.0963, "step": 346 }, { "epoch": 0.6765781135754326, "grad_norm": 0.40362536907196045, "learning_rate": 4.074708704592186e-07, "loss": 2.138, "step": 347 }, { "epoch": 0.678527906409944, "grad_norm": 0.4271102547645569, "learning_rate": 4.0712816997943796e-07, "loss": 2.0668, "step": 348 }, { "epoch": 0.6804776992444552, "grad_norm": 0.3873864412307739, "learning_rate": 4.067854694996573e-07, "loss": 2.0236, "step": 349 }, { "epoch": 0.6824274920789666, "grad_norm": 0.39676573872566223, "learning_rate": 4.064427690198766e-07, "loss": 2.0723, "step": 350 }, { "epoch": 0.6843772849134779, "grad_norm": 0.3926120102405548, "learning_rate": 4.0610006854009595e-07, "loss": 2.0193, "step": 351 }, { "epoch": 0.6863270777479893, "grad_norm": 0.3857557773590088, "learning_rate": 4.0575736806031524e-07, "loss": 2.0574, "step": 352 }, { "epoch": 0.6882768705825006, "grad_norm": 0.4042007327079773, "learning_rate": 4.054146675805346e-07, "loss": 2.0196, "step": 353 }, { "epoch": 0.6902266634170119, "grad_norm": 0.3976573944091797, "learning_rate": 4.0507196710075394e-07, "loss": 1.9201, "step": 354 }, { "epoch": 0.6921764562515232, "grad_norm": 0.38179242610931396, "learning_rate": 4.0472926662097323e-07, "loss": 2.0551, "step": 355 }, { "epoch": 0.6941262490860346, "grad_norm": 0.4144536256790161, "learning_rate": 4.043865661411926e-07, "loss": 2.0633, "step": 356 }, { "epoch": 0.696076041920546, "grad_norm": 0.42070674896240234, "learning_rate": 4.040438656614119e-07, "loss": 2.1222, "step": 357 }, { "epoch": 0.6980258347550573, "grad_norm": 0.394010066986084, "learning_rate": 4.037011651816312e-07, "loss": 2.0497, "step": 358 }, { "epoch": 0.6999756275895687, "grad_norm": 0.40751656889915466, "learning_rate": 4.0335846470185057e-07, "loss": 2.0554, "step": 359 }, { "epoch": 0.7019254204240799, "grad_norm": 0.3723933696746826, "learning_rate": 4.030157642220699e-07, "loss": 1.9727, "step": 360 }, { "epoch": 0.7038752132585913, "grad_norm": 0.3941795825958252, "learning_rate": 4.026730637422892e-07, "loss": 2.0793, "step": 361 }, { "epoch": 0.7058250060931026, "grad_norm": 0.3988247513771057, "learning_rate": 4.0233036326250856e-07, "loss": 2.1244, "step": 362 }, { "epoch": 0.707774798927614, "grad_norm": 0.409525603055954, "learning_rate": 4.019876627827279e-07, "loss": 2.0778, "step": 363 }, { "epoch": 0.7097245917621253, "grad_norm": 0.37638112902641296, "learning_rate": 4.016449623029472e-07, "loss": 1.9827, "step": 364 }, { "epoch": 0.7116743845966366, "grad_norm": 0.41931676864624023, "learning_rate": 4.013022618231665e-07, "loss": 2.0805, "step": 365 }, { "epoch": 0.7136241774311479, "grad_norm": 0.391668438911438, "learning_rate": 4.009595613433859e-07, "loss": 2.0695, "step": 366 }, { "epoch": 0.7155739702656593, "grad_norm": 0.4082440733909607, "learning_rate": 4.006168608636052e-07, "loss": 2.0232, "step": 367 }, { "epoch": 0.7175237631001706, "grad_norm": 0.41394224762916565, "learning_rate": 4.002741603838245e-07, "loss": 2.024, "step": 368 }, { "epoch": 0.719473555934682, "grad_norm": 0.41648924350738525, "learning_rate": 3.999314599040439e-07, "loss": 2.0108, "step": 369 }, { "epoch": 0.7214233487691932, "grad_norm": 0.408218652009964, "learning_rate": 3.995887594242632e-07, "loss": 2.0712, "step": 370 }, { "epoch": 0.7233731416037046, "grad_norm": 0.39029547572135925, "learning_rate": 3.9924605894448247e-07, "loss": 2.0475, "step": 371 }, { "epoch": 0.7253229344382159, "grad_norm": 0.4242095649242401, "learning_rate": 3.9890335846470187e-07, "loss": 2.0507, "step": 372 }, { "epoch": 0.7272727272727273, "grad_norm": 0.3876708745956421, "learning_rate": 3.9856065798492117e-07, "loss": 2.0161, "step": 373 }, { "epoch": 0.7292225201072386, "grad_norm": 0.41027507185935974, "learning_rate": 3.9821795750514046e-07, "loss": 2.0544, "step": 374 }, { "epoch": 0.7311723129417499, "grad_norm": 0.4173310697078705, "learning_rate": 3.9787525702535986e-07, "loss": 2.0615, "step": 375 }, { "epoch": 0.7331221057762612, "grad_norm": 0.40106937289237976, "learning_rate": 3.9753255654557916e-07, "loss": 2.0189, "step": 376 }, { "epoch": 0.7350718986107726, "grad_norm": 0.40185120701789856, "learning_rate": 3.9718985606579845e-07, "loss": 2.0624, "step": 377 }, { "epoch": 0.737021691445284, "grad_norm": 0.39821675419807434, "learning_rate": 3.968471555860178e-07, "loss": 2.0664, "step": 378 }, { "epoch": 0.7389714842797953, "grad_norm": 0.4365295171737671, "learning_rate": 3.9650445510623715e-07, "loss": 2.065, "step": 379 }, { "epoch": 0.7409212771143066, "grad_norm": 0.40240806341171265, "learning_rate": 3.9616175462645644e-07, "loss": 2.0526, "step": 380 }, { "epoch": 0.7428710699488179, "grad_norm": 0.4148831069469452, "learning_rate": 3.958190541466758e-07, "loss": 2.1255, "step": 381 }, { "epoch": 0.7448208627833293, "grad_norm": 0.4301227033138275, "learning_rate": 3.9547635366689514e-07, "loss": 2.0715, "step": 382 }, { "epoch": 0.7467706556178406, "grad_norm": 0.42958423495292664, "learning_rate": 3.9513365318711443e-07, "loss": 2.0762, "step": 383 }, { "epoch": 0.748720448452352, "grad_norm": 0.40311166644096375, "learning_rate": 3.947909527073338e-07, "loss": 2.0102, "step": 384 }, { "epoch": 0.7506702412868632, "grad_norm": 0.41303250193595886, "learning_rate": 3.944482522275531e-07, "loss": 2.0435, "step": 385 }, { "epoch": 0.7526200341213746, "grad_norm": 0.4167964458465576, "learning_rate": 3.941055517477724e-07, "loss": 2.0648, "step": 386 }, { "epoch": 0.7545698269558859, "grad_norm": 0.39250755310058594, "learning_rate": 3.9376285126799177e-07, "loss": 2.032, "step": 387 }, { "epoch": 0.7565196197903973, "grad_norm": 0.41534167528152466, "learning_rate": 3.9342015078821106e-07, "loss": 2.023, "step": 388 }, { "epoch": 0.7584694126249086, "grad_norm": 0.4158441424369812, "learning_rate": 3.930774503084304e-07, "loss": 2.1015, "step": 389 }, { "epoch": 0.76041920545942, "grad_norm": 0.39154303073883057, "learning_rate": 3.9273474982864976e-07, "loss": 2.0166, "step": 390 }, { "epoch": 0.7623689982939312, "grad_norm": 0.3865329325199127, "learning_rate": 3.9239204934886905e-07, "loss": 2.0209, "step": 391 }, { "epoch": 0.7643187911284426, "grad_norm": 0.4046148955821991, "learning_rate": 3.920493488690884e-07, "loss": 2.0501, "step": 392 }, { "epoch": 0.7662685839629539, "grad_norm": 0.4096246659755707, "learning_rate": 3.9170664838930775e-07, "loss": 2.0377, "step": 393 }, { "epoch": 0.7682183767974653, "grad_norm": 0.40363749861717224, "learning_rate": 3.9136394790952704e-07, "loss": 2.0315, "step": 394 }, { "epoch": 0.7701681696319767, "grad_norm": 0.4038202166557312, "learning_rate": 3.910212474297464e-07, "loss": 1.9516, "step": 395 }, { "epoch": 0.7721179624664879, "grad_norm": 0.3979615271091461, "learning_rate": 3.9067854694996574e-07, "loss": 2.02, "step": 396 }, { "epoch": 0.7740677553009992, "grad_norm": 0.4166601896286011, "learning_rate": 3.9033584647018503e-07, "loss": 2.0672, "step": 397 }, { "epoch": 0.7760175481355106, "grad_norm": 0.4038446545600891, "learning_rate": 3.899931459904044e-07, "loss": 2.0183, "step": 398 }, { "epoch": 0.777967340970022, "grad_norm": 0.4230453670024872, "learning_rate": 3.896504455106237e-07, "loss": 2.0234, "step": 399 }, { "epoch": 0.7799171338045333, "grad_norm": 0.4244215190410614, "learning_rate": 3.89307745030843e-07, "loss": 2.0863, "step": 400 }, { "epoch": 0.7818669266390446, "grad_norm": 0.42174607515335083, "learning_rate": 3.889650445510623e-07, "loss": 2.0775, "step": 401 }, { "epoch": 0.7838167194735559, "grad_norm": 0.4019846022129059, "learning_rate": 3.886223440712817e-07, "loss": 2.0445, "step": 402 }, { "epoch": 0.7857665123080673, "grad_norm": 0.4168083965778351, "learning_rate": 3.88279643591501e-07, "loss": 2.0457, "step": 403 }, { "epoch": 0.7877163051425786, "grad_norm": 0.4132064878940582, "learning_rate": 3.879369431117203e-07, "loss": 2.0637, "step": 404 }, { "epoch": 0.78966609797709, "grad_norm": 0.4239768981933594, "learning_rate": 3.875942426319397e-07, "loss": 2.0512, "step": 405 }, { "epoch": 0.7916158908116012, "grad_norm": 0.4192203879356384, "learning_rate": 3.87251542152159e-07, "loss": 2.0766, "step": 406 }, { "epoch": 0.7935656836461126, "grad_norm": 0.4393591582775116, "learning_rate": 3.869088416723783e-07, "loss": 2.0497, "step": 407 }, { "epoch": 0.7955154764806239, "grad_norm": 0.417614221572876, "learning_rate": 3.865661411925977e-07, "loss": 2.0518, "step": 408 }, { "epoch": 0.7974652693151353, "grad_norm": 0.4034237563610077, "learning_rate": 3.86223440712817e-07, "loss": 2.0604, "step": 409 }, { "epoch": 0.7994150621496466, "grad_norm": 0.4287107586860657, "learning_rate": 3.858807402330363e-07, "loss": 2.0386, "step": 410 }, { "epoch": 0.8013648549841579, "grad_norm": 0.4140661656856537, "learning_rate": 3.855380397532557e-07, "loss": 2.108, "step": 411 }, { "epoch": 0.8033146478186692, "grad_norm": 0.4189471900463104, "learning_rate": 3.85195339273475e-07, "loss": 2.0894, "step": 412 }, { "epoch": 0.8052644406531806, "grad_norm": 0.4111238121986389, "learning_rate": 3.8485263879369427e-07, "loss": 2.051, "step": 413 }, { "epoch": 0.807214233487692, "grad_norm": 0.4296090006828308, "learning_rate": 3.845099383139136e-07, "loss": 2.0484, "step": 414 }, { "epoch": 0.8091640263222033, "grad_norm": 0.4000217020511627, "learning_rate": 3.8416723783413297e-07, "loss": 2.0449, "step": 415 }, { "epoch": 0.8111138191567145, "grad_norm": 0.44013938307762146, "learning_rate": 3.8382453735435226e-07, "loss": 2.1467, "step": 416 }, { "epoch": 0.8130636119912259, "grad_norm": 0.4252108633518219, "learning_rate": 3.834818368745716e-07, "loss": 2.0725, "step": 417 }, { "epoch": 0.8150134048257373, "grad_norm": 0.41153863072395325, "learning_rate": 3.8313913639479096e-07, "loss": 2.0829, "step": 418 }, { "epoch": 0.8169631976602486, "grad_norm": 0.417043536901474, "learning_rate": 3.8279643591501025e-07, "loss": 1.9899, "step": 419 }, { "epoch": 0.81891299049476, "grad_norm": 0.41520485281944275, "learning_rate": 3.824537354352296e-07, "loss": 1.9941, "step": 420 }, { "epoch": 0.8208627833292713, "grad_norm": 0.4316999912261963, "learning_rate": 3.8211103495544895e-07, "loss": 2.051, "step": 421 }, { "epoch": 0.8228125761637826, "grad_norm": 0.4300172030925751, "learning_rate": 3.8176833447566824e-07, "loss": 2.025, "step": 422 }, { "epoch": 0.8247623689982939, "grad_norm": 0.4366534650325775, "learning_rate": 3.814256339958876e-07, "loss": 2.1326, "step": 423 }, { "epoch": 0.8267121618328053, "grad_norm": 0.412256121635437, "learning_rate": 3.8108293351610694e-07, "loss": 1.9799, "step": 424 }, { "epoch": 0.8286619546673166, "grad_norm": 0.4404711425304413, "learning_rate": 3.8074023303632623e-07, "loss": 2.0618, "step": 425 }, { "epoch": 0.830611747501828, "grad_norm": 0.41743820905685425, "learning_rate": 3.803975325565456e-07, "loss": 2.0293, "step": 426 }, { "epoch": 0.8325615403363392, "grad_norm": 0.40452542901039124, "learning_rate": 3.8005483207676487e-07, "loss": 2.0561, "step": 427 }, { "epoch": 0.8345113331708506, "grad_norm": 0.41732680797576904, "learning_rate": 3.797121315969842e-07, "loss": 1.9826, "step": 428 }, { "epoch": 0.8364611260053619, "grad_norm": 0.43309998512268066, "learning_rate": 3.7936943111720357e-07, "loss": 2.0313, "step": 429 }, { "epoch": 0.8384109188398733, "grad_norm": 0.43594348430633545, "learning_rate": 3.7902673063742286e-07, "loss": 2.0437, "step": 430 }, { "epoch": 0.8403607116743846, "grad_norm": 0.43290477991104126, "learning_rate": 3.786840301576422e-07, "loss": 2.1213, "step": 431 }, { "epoch": 0.8423105045088959, "grad_norm": 0.4143589735031128, "learning_rate": 3.7834132967786156e-07, "loss": 2.0327, "step": 432 }, { "epoch": 0.8442602973434072, "grad_norm": 0.4311947226524353, "learning_rate": 3.7799862919808085e-07, "loss": 2.0604, "step": 433 }, { "epoch": 0.8462100901779186, "grad_norm": 0.4119859039783478, "learning_rate": 3.776559287183002e-07, "loss": 2.0091, "step": 434 }, { "epoch": 0.84815988301243, "grad_norm": 0.4251650869846344, "learning_rate": 3.7731322823851955e-07, "loss": 2.05, "step": 435 }, { "epoch": 0.8501096758469413, "grad_norm": 0.4295788109302521, "learning_rate": 3.7697052775873884e-07, "loss": 2.0231, "step": 436 }, { "epoch": 0.8520594686814525, "grad_norm": 0.4099411964416504, "learning_rate": 3.766278272789582e-07, "loss": 2.1037, "step": 437 }, { "epoch": 0.8540092615159639, "grad_norm": 0.41294169425964355, "learning_rate": 3.7628512679917754e-07, "loss": 2.0535, "step": 438 }, { "epoch": 0.8559590543504753, "grad_norm": 0.4004737138748169, "learning_rate": 3.7594242631939683e-07, "loss": 2.0395, "step": 439 }, { "epoch": 0.8579088471849866, "grad_norm": 0.40913403034210205, "learning_rate": 3.755997258396161e-07, "loss": 1.9947, "step": 440 }, { "epoch": 0.859858640019498, "grad_norm": 0.41119128465652466, "learning_rate": 3.752570253598355e-07, "loss": 1.9859, "step": 441 }, { "epoch": 0.8618084328540092, "grad_norm": 0.44417282938957214, "learning_rate": 3.749143248800548e-07, "loss": 2.0712, "step": 442 }, { "epoch": 0.8637582256885206, "grad_norm": 0.41587620973587036, "learning_rate": 3.745716244002741e-07, "loss": 1.9921, "step": 443 }, { "epoch": 0.8657080185230319, "grad_norm": 0.4235389530658722, "learning_rate": 3.742289239204935e-07, "loss": 1.9941, "step": 444 }, { "epoch": 0.8676578113575433, "grad_norm": 0.4219055771827698, "learning_rate": 3.738862234407128e-07, "loss": 2.0621, "step": 445 }, { "epoch": 0.8696076041920546, "grad_norm": 0.42184367775917053, "learning_rate": 3.735435229609321e-07, "loss": 2.0307, "step": 446 }, { "epoch": 0.8715573970265659, "grad_norm": 0.39649975299835205, "learning_rate": 3.732008224811515e-07, "loss": 2.0264, "step": 447 }, { "epoch": 0.8735071898610772, "grad_norm": 0.4187317490577698, "learning_rate": 3.728581220013708e-07, "loss": 1.9778, "step": 448 }, { "epoch": 0.8754569826955886, "grad_norm": 0.41368138790130615, "learning_rate": 3.725154215215901e-07, "loss": 1.9953, "step": 449 }, { "epoch": 0.8774067755300999, "grad_norm": 0.4397999942302704, "learning_rate": 3.721727210418095e-07, "loss": 2.0835, "step": 450 }, { "epoch": 0.8793565683646113, "grad_norm": 0.41927337646484375, "learning_rate": 3.718300205620288e-07, "loss": 2.0307, "step": 451 }, { "epoch": 0.8813063611991226, "grad_norm": 0.43216344714164734, "learning_rate": 3.714873200822481e-07, "loss": 2.0669, "step": 452 }, { "epoch": 0.8832561540336339, "grad_norm": 0.4566250741481781, "learning_rate": 3.711446196024674e-07, "loss": 2.0423, "step": 453 }, { "epoch": 0.8852059468681452, "grad_norm": 0.4399709701538086, "learning_rate": 3.708019191226868e-07, "loss": 2.0859, "step": 454 }, { "epoch": 0.8871557397026566, "grad_norm": 0.44788333773612976, "learning_rate": 3.7045921864290607e-07, "loss": 2.0349, "step": 455 }, { "epoch": 0.889105532537168, "grad_norm": 0.4182490110397339, "learning_rate": 3.7011651816312537e-07, "loss": 1.9921, "step": 456 }, { "epoch": 0.8910553253716793, "grad_norm": 0.4325038194656372, "learning_rate": 3.6977381768334477e-07, "loss": 2.0419, "step": 457 }, { "epoch": 0.8930051182061906, "grad_norm": 0.48611199855804443, "learning_rate": 3.6943111720356406e-07, "loss": 2.1572, "step": 458 }, { "epoch": 0.8949549110407019, "grad_norm": 0.4303911030292511, "learning_rate": 3.6908841672378336e-07, "loss": 2.0137, "step": 459 }, { "epoch": 0.8969047038752133, "grad_norm": 0.4397573173046112, "learning_rate": 3.6874571624400276e-07, "loss": 2.0199, "step": 460 }, { "epoch": 0.8988544967097246, "grad_norm": 0.4570363163948059, "learning_rate": 3.6840301576422205e-07, "loss": 2.0648, "step": 461 }, { "epoch": 0.900804289544236, "grad_norm": 0.43259698152542114, "learning_rate": 3.6806031528444135e-07, "loss": 2.0121, "step": 462 }, { "epoch": 0.9027540823787472, "grad_norm": 0.44078147411346436, "learning_rate": 3.6771761480466075e-07, "loss": 2.0422, "step": 463 }, { "epoch": 0.9047038752132586, "grad_norm": 0.4169975519180298, "learning_rate": 3.6737491432488004e-07, "loss": 2.0453, "step": 464 }, { "epoch": 0.9066536680477699, "grad_norm": 0.44096165895462036, "learning_rate": 3.6703221384509934e-07, "loss": 2.0722, "step": 465 }, { "epoch": 0.9086034608822813, "grad_norm": 0.4220427870750427, "learning_rate": 3.666895133653187e-07, "loss": 2.052, "step": 466 }, { "epoch": 0.9105532537167926, "grad_norm": 0.41613534092903137, "learning_rate": 3.6634681288553803e-07, "loss": 2.0031, "step": 467 }, { "epoch": 0.9125030465513039, "grad_norm": 0.4290630519390106, "learning_rate": 3.660041124057573e-07, "loss": 2.108, "step": 468 }, { "epoch": 0.9144528393858152, "grad_norm": 0.41508668661117554, "learning_rate": 3.6566141192597667e-07, "loss": 2.0369, "step": 469 }, { "epoch": 0.9164026322203266, "grad_norm": 0.4051671326160431, "learning_rate": 3.65318711446196e-07, "loss": 2.0593, "step": 470 }, { "epoch": 0.9183524250548379, "grad_norm": 0.427229642868042, "learning_rate": 3.649760109664153e-07, "loss": 2.0303, "step": 471 }, { "epoch": 0.9203022178893493, "grad_norm": 0.408236026763916, "learning_rate": 3.6463331048663466e-07, "loss": 2.0537, "step": 472 }, { "epoch": 0.9222520107238605, "grad_norm": 0.4055333435535431, "learning_rate": 3.64290610006854e-07, "loss": 1.9684, "step": 473 }, { "epoch": 0.9242018035583719, "grad_norm": 0.4198017418384552, "learning_rate": 3.639479095270733e-07, "loss": 2.0429, "step": 474 }, { "epoch": 0.9261515963928832, "grad_norm": 0.4309008717536926, "learning_rate": 3.6360520904729265e-07, "loss": 2.0844, "step": 475 }, { "epoch": 0.9281013892273946, "grad_norm": 0.4177336096763611, "learning_rate": 3.63262508567512e-07, "loss": 2.0082, "step": 476 }, { "epoch": 0.930051182061906, "grad_norm": 0.42606329917907715, "learning_rate": 3.629198080877313e-07, "loss": 2.0371, "step": 477 }, { "epoch": 0.9320009748964172, "grad_norm": 0.4223528504371643, "learning_rate": 3.6257710760795064e-07, "loss": 2.0128, "step": 478 }, { "epoch": 0.9339507677309286, "grad_norm": 0.43999001383781433, "learning_rate": 3.6223440712816994e-07, "loss": 1.9984, "step": 479 }, { "epoch": 0.9359005605654399, "grad_norm": 0.44352471828460693, "learning_rate": 3.618917066483893e-07, "loss": 2.0501, "step": 480 }, { "epoch": 0.9378503533999513, "grad_norm": 0.4229583740234375, "learning_rate": 3.6154900616860863e-07, "loss": 2.0403, "step": 481 }, { "epoch": 0.9398001462344626, "grad_norm": 0.4202549457550049, "learning_rate": 3.612063056888279e-07, "loss": 1.9893, "step": 482 }, { "epoch": 0.941749939068974, "grad_norm": 0.4364420771598816, "learning_rate": 3.6086360520904727e-07, "loss": 1.9953, "step": 483 }, { "epoch": 0.9436997319034852, "grad_norm": 0.4317263662815094, "learning_rate": 3.605209047292666e-07, "loss": 2.0787, "step": 484 }, { "epoch": 0.9456495247379966, "grad_norm": 0.44858187437057495, "learning_rate": 3.601782042494859e-07, "loss": 2.1139, "step": 485 }, { "epoch": 0.9475993175725079, "grad_norm": 0.4311455488204956, "learning_rate": 3.5983550376970526e-07, "loss": 2.0409, "step": 486 }, { "epoch": 0.9495491104070193, "grad_norm": 0.42990413308143616, "learning_rate": 3.594928032899246e-07, "loss": 2.0478, "step": 487 }, { "epoch": 0.9514989032415306, "grad_norm": 0.4484078288078308, "learning_rate": 3.591501028101439e-07, "loss": 1.9989, "step": 488 }, { "epoch": 0.9534486960760419, "grad_norm": 0.438047856092453, "learning_rate": 3.5880740233036325e-07, "loss": 2.0468, "step": 489 }, { "epoch": 0.9553984889105532, "grad_norm": 0.4557168483734131, "learning_rate": 3.584647018505826e-07, "loss": 2.1145, "step": 490 }, { "epoch": 0.9573482817450646, "grad_norm": 0.41166436672210693, "learning_rate": 3.581220013708019e-07, "loss": 2.0639, "step": 491 }, { "epoch": 0.9592980745795759, "grad_norm": 0.4612530767917633, "learning_rate": 3.577793008910212e-07, "loss": 2.0139, "step": 492 }, { "epoch": 0.9612478674140873, "grad_norm": 0.4352019429206848, "learning_rate": 3.574366004112406e-07, "loss": 2.0984, "step": 493 }, { "epoch": 0.9631976602485985, "grad_norm": 0.4246942400932312, "learning_rate": 3.570938999314599e-07, "loss": 2.054, "step": 494 }, { "epoch": 0.9651474530831099, "grad_norm": 0.4309667646884918, "learning_rate": 3.567511994516792e-07, "loss": 1.9942, "step": 495 }, { "epoch": 0.9670972459176213, "grad_norm": 0.4459112584590912, "learning_rate": 3.564084989718986e-07, "loss": 2.0221, "step": 496 }, { "epoch": 0.9690470387521326, "grad_norm": 0.44149142503738403, "learning_rate": 3.5606579849211787e-07, "loss": 2.0181, "step": 497 }, { "epoch": 0.970996831586644, "grad_norm": 0.4406503736972809, "learning_rate": 3.5572309801233717e-07, "loss": 2.0666, "step": 498 }, { "epoch": 0.9729466244211552, "grad_norm": 0.4117674231529236, "learning_rate": 3.5538039753255657e-07, "loss": 1.982, "step": 499 }, { "epoch": 0.9748964172556666, "grad_norm": 0.43600788712501526, "learning_rate": 3.5503769705277586e-07, "loss": 1.9772, "step": 500 }, { "epoch": 0.9768462100901779, "grad_norm": 0.42391106486320496, "learning_rate": 3.5469499657299516e-07, "loss": 2.0304, "step": 501 }, { "epoch": 0.9787960029246893, "grad_norm": 0.44462934136390686, "learning_rate": 3.5435229609321456e-07, "loss": 2.0374, "step": 502 }, { "epoch": 0.9807457957592006, "grad_norm": 0.45238927006721497, "learning_rate": 3.5400959561343385e-07, "loss": 2.057, "step": 503 }, { "epoch": 0.9826955885937119, "grad_norm": 0.43034645915031433, "learning_rate": 3.5366689513365315e-07, "loss": 2.0392, "step": 504 }, { "epoch": 0.9846453814282232, "grad_norm": 0.42902877926826477, "learning_rate": 3.533241946538725e-07, "loss": 2.045, "step": 505 }, { "epoch": 0.9865951742627346, "grad_norm": 0.4340520203113556, "learning_rate": 3.5298149417409184e-07, "loss": 2.0439, "step": 506 }, { "epoch": 0.9885449670972459, "grad_norm": 0.45374131202697754, "learning_rate": 3.5263879369431114e-07, "loss": 2.0431, "step": 507 }, { "epoch": 0.9904947599317573, "grad_norm": 0.44037064909935, "learning_rate": 3.522960932145305e-07, "loss": 2.0123, "step": 508 }, { "epoch": 0.9924445527662685, "grad_norm": 0.42846593260765076, "learning_rate": 3.5195339273474983e-07, "loss": 1.9661, "step": 509 }, { "epoch": 0.9943943456007799, "grad_norm": 0.4789009392261505, "learning_rate": 3.516106922549691e-07, "loss": 2.0753, "step": 510 }, { "epoch": 0.9963441384352912, "grad_norm": 0.44283124804496765, "learning_rate": 3.5126799177518847e-07, "loss": 2.0581, "step": 511 }, { "epoch": 0.9982939312698026, "grad_norm": 0.43828728795051575, "learning_rate": 3.509252912954078e-07, "loss": 2.05, "step": 512 }, { "epoch": 0.9982939312698026, "eval_loss": 2.046032667160034, "eval_runtime": 481.0273, "eval_samples_per_second": 1.293, "eval_steps_per_second": 0.324, "step": 512 } ], "logging_steps": 1, "max_steps": 1536, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5142103496146289e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }