ssunggun2's picture
Upload folder using huggingface_hub
0028297 verified
raw
history blame
179 kB
{
"best_metric": 2.0196783542633057,
"best_model_checkpoint": "/home/sunggeunan/data/ICL/outputs/lora/SKIML-ICL_mrqa_nq_v3/Meta-Llama-3-8B-Instruct-unanswerable-5Q-0U-0C-qa_first/checkpoint-1025",
"epoch": 1.9985376553741165,
"eval_steps": 500,
"global_step": 1025,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019497928345113332,
"grad_norm": 0.2671431005001068,
"learning_rate": 6.493506493506494e-09,
"loss": 2.0238,
"step": 1
},
{
"epoch": 0.0038995856690226664,
"grad_norm": 0.26295146346092224,
"learning_rate": 1.2987012987012988e-08,
"loss": 2.0792,
"step": 2
},
{
"epoch": 0.005849378503534,
"grad_norm": 0.26511502265930176,
"learning_rate": 1.9480519480519478e-08,
"loss": 2.1298,
"step": 3
},
{
"epoch": 0.007799171338045333,
"grad_norm": 0.268216997385025,
"learning_rate": 2.5974025974025976e-08,
"loss": 2.0854,
"step": 4
},
{
"epoch": 0.009748964172556666,
"grad_norm": 0.2698403000831604,
"learning_rate": 3.246753246753246e-08,
"loss": 2.0665,
"step": 5
},
{
"epoch": 0.011698757007068,
"grad_norm": 0.2657904624938965,
"learning_rate": 3.8961038961038956e-08,
"loss": 2.0213,
"step": 6
},
{
"epoch": 0.013648549841579332,
"grad_norm": 0.2607410252094269,
"learning_rate": 4.545454545454545e-08,
"loss": 2.0425,
"step": 7
},
{
"epoch": 0.015598342676090666,
"grad_norm": 0.28946036100387573,
"learning_rate": 5.194805194805195e-08,
"loss": 2.0742,
"step": 8
},
{
"epoch": 0.017548135510601998,
"grad_norm": 0.250527948141098,
"learning_rate": 5.844155844155844e-08,
"loss": 2.1037,
"step": 9
},
{
"epoch": 0.01949792834511333,
"grad_norm": 0.29370346665382385,
"learning_rate": 6.493506493506492e-08,
"loss": 2.1355,
"step": 10
},
{
"epoch": 0.021447721179624665,
"grad_norm": 0.2751532196998596,
"learning_rate": 7.142857142857142e-08,
"loss": 2.1219,
"step": 11
},
{
"epoch": 0.023397514014136,
"grad_norm": 0.2966114282608032,
"learning_rate": 7.792207792207791e-08,
"loss": 2.1788,
"step": 12
},
{
"epoch": 0.02534730684864733,
"grad_norm": 0.24350005388259888,
"learning_rate": 8.441558441558441e-08,
"loss": 2.0531,
"step": 13
},
{
"epoch": 0.027297099683158663,
"grad_norm": 0.2536744177341461,
"learning_rate": 9.09090909090909e-08,
"loss": 2.0485,
"step": 14
},
{
"epoch": 0.029246892517669997,
"grad_norm": 0.2583434581756592,
"learning_rate": 9.74025974025974e-08,
"loss": 2.0712,
"step": 15
},
{
"epoch": 0.03119668535218133,
"grad_norm": 0.25572890043258667,
"learning_rate": 1.038961038961039e-07,
"loss": 2.0674,
"step": 16
},
{
"epoch": 0.03314647818669266,
"grad_norm": 0.24798272550106049,
"learning_rate": 1.1038961038961038e-07,
"loss": 1.9777,
"step": 17
},
{
"epoch": 0.035096271021203995,
"grad_norm": 0.25968796014785767,
"learning_rate": 1.1688311688311688e-07,
"loss": 2.1233,
"step": 18
},
{
"epoch": 0.03704606385571533,
"grad_norm": 0.2510642111301422,
"learning_rate": 1.2337662337662337e-07,
"loss": 2.0819,
"step": 19
},
{
"epoch": 0.03899585669022666,
"grad_norm": 0.2636696696281433,
"learning_rate": 1.2987012987012984e-07,
"loss": 2.1369,
"step": 20
},
{
"epoch": 0.040945649524738,
"grad_norm": 0.26741182804107666,
"learning_rate": 1.3636363636363635e-07,
"loss": 2.0973,
"step": 21
},
{
"epoch": 0.04289544235924933,
"grad_norm": 0.2516593933105469,
"learning_rate": 1.4285714285714285e-07,
"loss": 2.1089,
"step": 22
},
{
"epoch": 0.044845235193760664,
"grad_norm": 0.2642120122909546,
"learning_rate": 1.4935064935064935e-07,
"loss": 2.069,
"step": 23
},
{
"epoch": 0.046795028028272,
"grad_norm": 0.2595269978046417,
"learning_rate": 1.5584415584415582e-07,
"loss": 2.1304,
"step": 24
},
{
"epoch": 0.04874482086278333,
"grad_norm": 0.2557779848575592,
"learning_rate": 1.6233766233766232e-07,
"loss": 2.0084,
"step": 25
},
{
"epoch": 0.05069461369729466,
"grad_norm": 0.26405468583106995,
"learning_rate": 1.6883116883116883e-07,
"loss": 2.0683,
"step": 26
},
{
"epoch": 0.05264440653180599,
"grad_norm": 0.2540312111377716,
"learning_rate": 1.7532467532467533e-07,
"loss": 2.1389,
"step": 27
},
{
"epoch": 0.05459419936631733,
"grad_norm": 0.2732296586036682,
"learning_rate": 1.818181818181818e-07,
"loss": 2.0663,
"step": 28
},
{
"epoch": 0.05654399220082866,
"grad_norm": 0.2802280783653259,
"learning_rate": 1.883116883116883e-07,
"loss": 2.0758,
"step": 29
},
{
"epoch": 0.058493785035339994,
"grad_norm": 0.2741639018058777,
"learning_rate": 1.948051948051948e-07,
"loss": 2.0638,
"step": 30
},
{
"epoch": 0.06044357786985133,
"grad_norm": 0.2648272216320038,
"learning_rate": 2.012987012987013e-07,
"loss": 2.0978,
"step": 31
},
{
"epoch": 0.06239337070436266,
"grad_norm": 0.2700302004814148,
"learning_rate": 2.077922077922078e-07,
"loss": 2.1145,
"step": 32
},
{
"epoch": 0.064343163538874,
"grad_norm": 0.24180686473846436,
"learning_rate": 2.1428571428571426e-07,
"loss": 2.0752,
"step": 33
},
{
"epoch": 0.06629295637338532,
"grad_norm": 0.27451491355895996,
"learning_rate": 2.2077922077922076e-07,
"loss": 2.0719,
"step": 34
},
{
"epoch": 0.06824274920789666,
"grad_norm": 0.2594657838344574,
"learning_rate": 2.2727272727272726e-07,
"loss": 2.0107,
"step": 35
},
{
"epoch": 0.07019254204240799,
"grad_norm": 0.26720282435417175,
"learning_rate": 2.3376623376623376e-07,
"loss": 2.1045,
"step": 36
},
{
"epoch": 0.07214233487691933,
"grad_norm": 0.2727048695087433,
"learning_rate": 2.4025974025974024e-07,
"loss": 2.0983,
"step": 37
},
{
"epoch": 0.07409212771143066,
"grad_norm": 0.2821039855480194,
"learning_rate": 2.4675324675324674e-07,
"loss": 2.1199,
"step": 38
},
{
"epoch": 0.076041920545942,
"grad_norm": 0.2540994882583618,
"learning_rate": 2.532467532467532e-07,
"loss": 2.0925,
"step": 39
},
{
"epoch": 0.07799171338045333,
"grad_norm": 0.2766543924808502,
"learning_rate": 2.597402597402597e-07,
"loss": 2.1259,
"step": 40
},
{
"epoch": 0.07994150621496467,
"grad_norm": 0.28683698177337646,
"learning_rate": 2.662337662337662e-07,
"loss": 2.135,
"step": 41
},
{
"epoch": 0.081891299049476,
"grad_norm": 0.25892165303230286,
"learning_rate": 2.727272727272727e-07,
"loss": 2.0734,
"step": 42
},
{
"epoch": 0.08384109188398732,
"grad_norm": 0.2723507881164551,
"learning_rate": 2.792207792207792e-07,
"loss": 2.0313,
"step": 43
},
{
"epoch": 0.08579088471849866,
"grad_norm": 0.25262904167175293,
"learning_rate": 2.857142857142857e-07,
"loss": 2.0777,
"step": 44
},
{
"epoch": 0.08774067755300999,
"grad_norm": 0.26076266169548035,
"learning_rate": 2.922077922077922e-07,
"loss": 2.0877,
"step": 45
},
{
"epoch": 0.08969047038752133,
"grad_norm": 0.2711774408817291,
"learning_rate": 2.987012987012987e-07,
"loss": 2.063,
"step": 46
},
{
"epoch": 0.09164026322203266,
"grad_norm": 0.24715273082256317,
"learning_rate": 3.0519480519480515e-07,
"loss": 2.0698,
"step": 47
},
{
"epoch": 0.093590056056544,
"grad_norm": 0.2721501588821411,
"learning_rate": 3.1168831168831165e-07,
"loss": 2.0192,
"step": 48
},
{
"epoch": 0.09553984889105532,
"grad_norm": 0.2476457953453064,
"learning_rate": 3.1818181818181815e-07,
"loss": 2.0208,
"step": 49
},
{
"epoch": 0.09748964172556666,
"grad_norm": 0.26186031103134155,
"learning_rate": 3.2467532467532465e-07,
"loss": 2.1028,
"step": 50
},
{
"epoch": 0.09943943456007799,
"grad_norm": 0.263841450214386,
"learning_rate": 3.3116883116883115e-07,
"loss": 2.071,
"step": 51
},
{
"epoch": 0.10138922739458932,
"grad_norm": 0.27216637134552,
"learning_rate": 3.3766233766233765e-07,
"loss": 2.0743,
"step": 52
},
{
"epoch": 0.10333902022910066,
"grad_norm": 0.25524261593818665,
"learning_rate": 3.4415584415584415e-07,
"loss": 2.0426,
"step": 53
},
{
"epoch": 0.10528881306361199,
"grad_norm": 0.2809346914291382,
"learning_rate": 3.5064935064935066e-07,
"loss": 2.049,
"step": 54
},
{
"epoch": 0.10723860589812333,
"grad_norm": 0.25672242045402527,
"learning_rate": 3.5714285714285716e-07,
"loss": 2.0213,
"step": 55
},
{
"epoch": 0.10918839873263465,
"grad_norm": 0.2544190585613251,
"learning_rate": 3.636363636363636e-07,
"loss": 2.0663,
"step": 56
},
{
"epoch": 0.111138191567146,
"grad_norm": 0.26028168201446533,
"learning_rate": 3.701298701298701e-07,
"loss": 2.0947,
"step": 57
},
{
"epoch": 0.11308798440165732,
"grad_norm": 0.26112449169158936,
"learning_rate": 3.766233766233766e-07,
"loss": 2.0611,
"step": 58
},
{
"epoch": 0.11503777723616866,
"grad_norm": 0.29020223021507263,
"learning_rate": 3.831168831168831e-07,
"loss": 2.1048,
"step": 59
},
{
"epoch": 0.11698757007067999,
"grad_norm": 0.269167959690094,
"learning_rate": 3.896103896103896e-07,
"loss": 2.0392,
"step": 60
},
{
"epoch": 0.11893736290519133,
"grad_norm": 0.2823875844478607,
"learning_rate": 3.961038961038961e-07,
"loss": 2.1341,
"step": 61
},
{
"epoch": 0.12088715573970266,
"grad_norm": 0.27546533942222595,
"learning_rate": 4.025974025974026e-07,
"loss": 2.0903,
"step": 62
},
{
"epoch": 0.12283694857421398,
"grad_norm": 0.2821657657623291,
"learning_rate": 4.090909090909091e-07,
"loss": 2.1028,
"step": 63
},
{
"epoch": 0.12478674140872532,
"grad_norm": 0.2886088788509369,
"learning_rate": 4.155844155844156e-07,
"loss": 2.0685,
"step": 64
},
{
"epoch": 0.12673653424323666,
"grad_norm": 0.3001558482646942,
"learning_rate": 4.22077922077922e-07,
"loss": 2.0996,
"step": 65
},
{
"epoch": 0.128686327077748,
"grad_norm": 0.24933473765850067,
"learning_rate": 4.285714285714285e-07,
"loss": 2.0242,
"step": 66
},
{
"epoch": 0.13063611991225932,
"grad_norm": 0.27868619561195374,
"learning_rate": 4.35064935064935e-07,
"loss": 2.0535,
"step": 67
},
{
"epoch": 0.13258591274677065,
"grad_norm": 0.29242217540740967,
"learning_rate": 4.415584415584415e-07,
"loss": 2.0379,
"step": 68
},
{
"epoch": 0.134535705581282,
"grad_norm": 0.2707277536392212,
"learning_rate": 4.48051948051948e-07,
"loss": 2.0922,
"step": 69
},
{
"epoch": 0.13648549841579333,
"grad_norm": 0.2940627336502075,
"learning_rate": 4.545454545454545e-07,
"loss": 2.0857,
"step": 70
},
{
"epoch": 0.13843529125030465,
"grad_norm": 0.25989463925361633,
"learning_rate": 4.61038961038961e-07,
"loss": 2.0664,
"step": 71
},
{
"epoch": 0.14038508408481598,
"grad_norm": 0.2827669382095337,
"learning_rate": 4.675324675324675e-07,
"loss": 2.0804,
"step": 72
},
{
"epoch": 0.1423348769193273,
"grad_norm": 0.2898445725440979,
"learning_rate": 4.7402597402597397e-07,
"loss": 2.1116,
"step": 73
},
{
"epoch": 0.14428466975383866,
"grad_norm": 0.2953305244445801,
"learning_rate": 4.805194805194805e-07,
"loss": 2.0997,
"step": 74
},
{
"epoch": 0.14623446258835,
"grad_norm": 0.28880831599235535,
"learning_rate": 4.87012987012987e-07,
"loss": 2.0695,
"step": 75
},
{
"epoch": 0.14818425542286132,
"grad_norm": 0.2893301844596863,
"learning_rate": 4.935064935064935e-07,
"loss": 2.1663,
"step": 76
},
{
"epoch": 0.15013404825737264,
"grad_norm": 0.27863314747810364,
"learning_rate": 5e-07,
"loss": 2.0468,
"step": 77
},
{
"epoch": 0.152083841091884,
"grad_norm": 0.27849143743515015,
"learning_rate": 4.996572995202193e-07,
"loss": 2.0909,
"step": 78
},
{
"epoch": 0.15403363392639532,
"grad_norm": 0.2688325345516205,
"learning_rate": 4.993145990404387e-07,
"loss": 2.1058,
"step": 79
},
{
"epoch": 0.15598342676090665,
"grad_norm": 0.2714349627494812,
"learning_rate": 4.989718985606579e-07,
"loss": 2.0719,
"step": 80
},
{
"epoch": 0.15793321959541798,
"grad_norm": 0.267674058675766,
"learning_rate": 4.986291980808773e-07,
"loss": 2.003,
"step": 81
},
{
"epoch": 0.15988301242992933,
"grad_norm": 0.26871585845947266,
"learning_rate": 4.982864976010966e-07,
"loss": 2.0506,
"step": 82
},
{
"epoch": 0.16183280526444066,
"grad_norm": 0.27725961804389954,
"learning_rate": 4.97943797121316e-07,
"loss": 2.0908,
"step": 83
},
{
"epoch": 0.163782598098952,
"grad_norm": 0.26912689208984375,
"learning_rate": 4.976010966415353e-07,
"loss": 2.1065,
"step": 84
},
{
"epoch": 0.1657323909334633,
"grad_norm": 0.26862508058547974,
"learning_rate": 4.972583961617545e-07,
"loss": 2.0017,
"step": 85
},
{
"epoch": 0.16768218376797464,
"grad_norm": 0.2780780792236328,
"learning_rate": 4.969156956819739e-07,
"loss": 2.0812,
"step": 86
},
{
"epoch": 0.169631976602486,
"grad_norm": 0.2691902816295624,
"learning_rate": 4.965729952021932e-07,
"loss": 2.108,
"step": 87
},
{
"epoch": 0.17158176943699732,
"grad_norm": 0.25564315915107727,
"learning_rate": 4.962302947224126e-07,
"loss": 2.0141,
"step": 88
},
{
"epoch": 0.17353156227150865,
"grad_norm": 0.29978710412979126,
"learning_rate": 4.958875942426319e-07,
"loss": 2.1087,
"step": 89
},
{
"epoch": 0.17548135510601998,
"grad_norm": 0.26945438981056213,
"learning_rate": 4.955448937628513e-07,
"loss": 2.0654,
"step": 90
},
{
"epoch": 0.17743114794053133,
"grad_norm": 0.2857602834701538,
"learning_rate": 4.952021932830705e-07,
"loss": 2.0258,
"step": 91
},
{
"epoch": 0.17938094077504266,
"grad_norm": 0.3205603063106537,
"learning_rate": 4.948594928032899e-07,
"loss": 2.0839,
"step": 92
},
{
"epoch": 0.18133073360955398,
"grad_norm": 0.29022127389907837,
"learning_rate": 4.945167923235092e-07,
"loss": 2.063,
"step": 93
},
{
"epoch": 0.1832805264440653,
"grad_norm": 0.2677106559276581,
"learning_rate": 4.941740918437286e-07,
"loss": 2.0257,
"step": 94
},
{
"epoch": 0.18523031927857664,
"grad_norm": 0.2686716318130493,
"learning_rate": 4.938313913639479e-07,
"loss": 2.053,
"step": 95
},
{
"epoch": 0.187180112113088,
"grad_norm": 0.3096849322319031,
"learning_rate": 4.934886908841673e-07,
"loss": 2.0954,
"step": 96
},
{
"epoch": 0.18912990494759932,
"grad_norm": 0.29678693413734436,
"learning_rate": 4.931459904043865e-07,
"loss": 2.0984,
"step": 97
},
{
"epoch": 0.19107969778211065,
"grad_norm": 0.29280567169189453,
"learning_rate": 4.928032899246059e-07,
"loss": 2.1523,
"step": 98
},
{
"epoch": 0.19302949061662197,
"grad_norm": 0.33339405059814453,
"learning_rate": 4.924605894448252e-07,
"loss": 2.1537,
"step": 99
},
{
"epoch": 0.19497928345113333,
"grad_norm": 0.2959805727005005,
"learning_rate": 4.921178889650445e-07,
"loss": 2.07,
"step": 100
},
{
"epoch": 0.19692907628564466,
"grad_norm": 0.2850833535194397,
"learning_rate": 4.917751884852638e-07,
"loss": 2.0565,
"step": 101
},
{
"epoch": 0.19887886912015598,
"grad_norm": 0.27677983045578003,
"learning_rate": 4.914324880054832e-07,
"loss": 2.0252,
"step": 102
},
{
"epoch": 0.2008286619546673,
"grad_norm": 0.2881922423839569,
"learning_rate": 4.910897875257025e-07,
"loss": 2.1085,
"step": 103
},
{
"epoch": 0.20277845478917864,
"grad_norm": 0.28352612257003784,
"learning_rate": 4.907470870459218e-07,
"loss": 2.0758,
"step": 104
},
{
"epoch": 0.20472824762369,
"grad_norm": 0.2815571427345276,
"learning_rate": 4.904043865661412e-07,
"loss": 2.0588,
"step": 105
},
{
"epoch": 0.20667804045820132,
"grad_norm": 0.2817777395248413,
"learning_rate": 4.900616860863605e-07,
"loss": 2.0751,
"step": 106
},
{
"epoch": 0.20862783329271264,
"grad_norm": 0.29829949140548706,
"learning_rate": 4.897189856065798e-07,
"loss": 2.0505,
"step": 107
},
{
"epoch": 0.21057762612722397,
"grad_norm": 0.2886929214000702,
"learning_rate": 4.893762851267992e-07,
"loss": 2.028,
"step": 108
},
{
"epoch": 0.21252741896173533,
"grad_norm": 0.28375059366226196,
"learning_rate": 4.890335846470185e-07,
"loss": 2.0282,
"step": 109
},
{
"epoch": 0.21447721179624665,
"grad_norm": 0.27930572628974915,
"learning_rate": 4.886908841672378e-07,
"loss": 2.1027,
"step": 110
},
{
"epoch": 0.21642700463075798,
"grad_norm": 0.27910512685775757,
"learning_rate": 4.883481836874572e-07,
"loss": 2.1146,
"step": 111
},
{
"epoch": 0.2183767974652693,
"grad_norm": 0.286739319562912,
"learning_rate": 4.880054832076765e-07,
"loss": 2.0727,
"step": 112
},
{
"epoch": 0.22032659029978066,
"grad_norm": 0.2716750502586365,
"learning_rate": 4.876627827278957e-07,
"loss": 2.02,
"step": 113
},
{
"epoch": 0.222276383134292,
"grad_norm": 0.28050121665000916,
"learning_rate": 4.873200822481151e-07,
"loss": 1.9912,
"step": 114
},
{
"epoch": 0.22422617596880332,
"grad_norm": 0.31914082169532776,
"learning_rate": 4.869773817683344e-07,
"loss": 2.0654,
"step": 115
},
{
"epoch": 0.22617596880331464,
"grad_norm": 0.3212663233280182,
"learning_rate": 4.866346812885538e-07,
"loss": 2.1145,
"step": 116
},
{
"epoch": 0.22812576163782597,
"grad_norm": 0.3040018081665039,
"learning_rate": 4.862919808087731e-07,
"loss": 2.1285,
"step": 117
},
{
"epoch": 0.23007555447233732,
"grad_norm": 0.3013773560523987,
"learning_rate": 4.859492803289925e-07,
"loss": 2.0631,
"step": 118
},
{
"epoch": 0.23202534730684865,
"grad_norm": 0.2854544520378113,
"learning_rate": 4.856065798492117e-07,
"loss": 2.0701,
"step": 119
},
{
"epoch": 0.23397514014135998,
"grad_norm": 0.27997076511383057,
"learning_rate": 4.852638793694311e-07,
"loss": 1.9768,
"step": 120
},
{
"epoch": 0.2359249329758713,
"grad_norm": 0.2790175974369049,
"learning_rate": 4.849211788896504e-07,
"loss": 2.0499,
"step": 121
},
{
"epoch": 0.23787472581038266,
"grad_norm": 0.28126639127731323,
"learning_rate": 4.845784784098698e-07,
"loss": 2.0691,
"step": 122
},
{
"epoch": 0.23982451864489399,
"grad_norm": 0.32007864117622375,
"learning_rate": 4.842357779300891e-07,
"loss": 2.0886,
"step": 123
},
{
"epoch": 0.2417743114794053,
"grad_norm": 0.3017228841781616,
"learning_rate": 4.838930774503084e-07,
"loss": 2.0796,
"step": 124
},
{
"epoch": 0.24372410431391664,
"grad_norm": 0.28364625573158264,
"learning_rate": 4.835503769705277e-07,
"loss": 2.0737,
"step": 125
},
{
"epoch": 0.24567389714842797,
"grad_norm": 0.3120713233947754,
"learning_rate": 4.83207676490747e-07,
"loss": 2.0741,
"step": 126
},
{
"epoch": 0.24762368998293932,
"grad_norm": 0.293863445520401,
"learning_rate": 4.828649760109664e-07,
"loss": 1.9777,
"step": 127
},
{
"epoch": 0.24957348281745065,
"grad_norm": 0.2932412326335907,
"learning_rate": 4.825222755311857e-07,
"loss": 2.0567,
"step": 128
},
{
"epoch": 0.251523275651962,
"grad_norm": 0.29689502716064453,
"learning_rate": 4.821795750514051e-07,
"loss": 2.0251,
"step": 129
},
{
"epoch": 0.25347306848647333,
"grad_norm": 0.2953934669494629,
"learning_rate": 4.818368745716243e-07,
"loss": 2.0826,
"step": 130
},
{
"epoch": 0.25542286132098463,
"grad_norm": 0.29008495807647705,
"learning_rate": 4.814941740918437e-07,
"loss": 1.9974,
"step": 131
},
{
"epoch": 0.257372654155496,
"grad_norm": 0.29402440786361694,
"learning_rate": 4.81151473612063e-07,
"loss": 2.1115,
"step": 132
},
{
"epoch": 0.25932244699000734,
"grad_norm": 0.313650906085968,
"learning_rate": 4.808087731322824e-07,
"loss": 2.0834,
"step": 133
},
{
"epoch": 0.26127223982451864,
"grad_norm": 0.2968846261501312,
"learning_rate": 4.804660726525017e-07,
"loss": 2.0786,
"step": 134
},
{
"epoch": 0.26322203265903,
"grad_norm": 0.30427923798561096,
"learning_rate": 4.801233721727211e-07,
"loss": 1.9974,
"step": 135
},
{
"epoch": 0.2651718254935413,
"grad_norm": 0.3112437129020691,
"learning_rate": 4.797806716929403e-07,
"loss": 2.0837,
"step": 136
},
{
"epoch": 0.26712161832805265,
"grad_norm": 0.30960723757743835,
"learning_rate": 4.794379712131597e-07,
"loss": 2.1307,
"step": 137
},
{
"epoch": 0.269071411162564,
"grad_norm": 0.3101617097854614,
"learning_rate": 4.79095270733379e-07,
"loss": 2.0395,
"step": 138
},
{
"epoch": 0.2710212039970753,
"grad_norm": 0.2995094358921051,
"learning_rate": 4.787525702535984e-07,
"loss": 2.0844,
"step": 139
},
{
"epoch": 0.27297099683158665,
"grad_norm": 0.29981735348701477,
"learning_rate": 4.784098697738176e-07,
"loss": 2.0474,
"step": 140
},
{
"epoch": 0.27492078966609795,
"grad_norm": 0.29965049028396606,
"learning_rate": 4.78067169294037e-07,
"loss": 2.0664,
"step": 141
},
{
"epoch": 0.2768705825006093,
"grad_norm": 0.31631559133529663,
"learning_rate": 4.777244688142563e-07,
"loss": 2.0932,
"step": 142
},
{
"epoch": 0.27882037533512066,
"grad_norm": 0.32392817735671997,
"learning_rate": 4.773817683344756e-07,
"loss": 2.0404,
"step": 143
},
{
"epoch": 0.28077016816963196,
"grad_norm": 0.2919900715351105,
"learning_rate": 4.77039067854695e-07,
"loss": 2.0367,
"step": 144
},
{
"epoch": 0.2827199610041433,
"grad_norm": 0.3037238121032715,
"learning_rate": 4.7669636737491434e-07,
"loss": 2.0741,
"step": 145
},
{
"epoch": 0.2846697538386546,
"grad_norm": 0.2894318997859955,
"learning_rate": 4.7635366689513363e-07,
"loss": 2.0676,
"step": 146
},
{
"epoch": 0.28661954667316597,
"grad_norm": 0.3007095158100128,
"learning_rate": 4.760109664153529e-07,
"loss": 2.051,
"step": 147
},
{
"epoch": 0.2885693395076773,
"grad_norm": 0.31736671924591064,
"learning_rate": 4.756682659355723e-07,
"loss": 2.0587,
"step": 148
},
{
"epoch": 0.2905191323421886,
"grad_norm": 0.3223492503166199,
"learning_rate": 4.753255654557916e-07,
"loss": 2.0884,
"step": 149
},
{
"epoch": 0.2924689251767,
"grad_norm": 0.31644171476364136,
"learning_rate": 4.749828649760109e-07,
"loss": 2.128,
"step": 150
},
{
"epoch": 0.29441871801121133,
"grad_norm": 0.3055993914604187,
"learning_rate": 4.746401644962303e-07,
"loss": 2.0597,
"step": 151
},
{
"epoch": 0.29636851084572263,
"grad_norm": 0.3014571964740753,
"learning_rate": 4.742974640164496e-07,
"loss": 2.0674,
"step": 152
},
{
"epoch": 0.298318303680234,
"grad_norm": 0.33088865876197815,
"learning_rate": 4.739547635366689e-07,
"loss": 2.0636,
"step": 153
},
{
"epoch": 0.3002680965147453,
"grad_norm": 0.3139593005180359,
"learning_rate": 4.736120630568883e-07,
"loss": 2.0674,
"step": 154
},
{
"epoch": 0.30221788934925664,
"grad_norm": 0.31804022192955017,
"learning_rate": 4.732693625771076e-07,
"loss": 2.1092,
"step": 155
},
{
"epoch": 0.304167682183768,
"grad_norm": 0.34043845534324646,
"learning_rate": 4.729266620973269e-07,
"loss": 2.0391,
"step": 156
},
{
"epoch": 0.3061174750182793,
"grad_norm": 0.34768176078796387,
"learning_rate": 4.725839616175463e-07,
"loss": 2.0984,
"step": 157
},
{
"epoch": 0.30806726785279065,
"grad_norm": 0.30159029364585876,
"learning_rate": 4.722412611377656e-07,
"loss": 2.0085,
"step": 158
},
{
"epoch": 0.31001706068730195,
"grad_norm": 0.3267905116081238,
"learning_rate": 4.718985606579849e-07,
"loss": 2.0719,
"step": 159
},
{
"epoch": 0.3119668535218133,
"grad_norm": 0.3086291551589966,
"learning_rate": 4.715558601782042e-07,
"loss": 2.0928,
"step": 160
},
{
"epoch": 0.31391664635632466,
"grad_norm": 0.30459094047546387,
"learning_rate": 4.712131596984236e-07,
"loss": 2.1044,
"step": 161
},
{
"epoch": 0.31586643919083596,
"grad_norm": 0.2868260443210602,
"learning_rate": 4.7087045921864287e-07,
"loss": 2.0631,
"step": 162
},
{
"epoch": 0.3178162320253473,
"grad_norm": 0.3526155650615692,
"learning_rate": 4.7052775873886217e-07,
"loss": 2.0573,
"step": 163
},
{
"epoch": 0.31976602485985867,
"grad_norm": 0.3164813220500946,
"learning_rate": 4.7018505825908157e-07,
"loss": 2.1207,
"step": 164
},
{
"epoch": 0.32171581769436997,
"grad_norm": 0.3223491907119751,
"learning_rate": 4.6984235777930086e-07,
"loss": 2.089,
"step": 165
},
{
"epoch": 0.3236656105288813,
"grad_norm": 0.3313138484954834,
"learning_rate": 4.6949965729952016e-07,
"loss": 2.0777,
"step": 166
},
{
"epoch": 0.3256154033633926,
"grad_norm": 0.3372494876384735,
"learning_rate": 4.6915695681973956e-07,
"loss": 2.0185,
"step": 167
},
{
"epoch": 0.327565196197904,
"grad_norm": 0.3191705346107483,
"learning_rate": 4.6881425633995885e-07,
"loss": 2.0505,
"step": 168
},
{
"epoch": 0.32951498903241533,
"grad_norm": 0.32238319516181946,
"learning_rate": 4.6847155586017815e-07,
"loss": 2.126,
"step": 169
},
{
"epoch": 0.3314647818669266,
"grad_norm": 0.31298163533210754,
"learning_rate": 4.6812885538039755e-07,
"loss": 2.1064,
"step": 170
},
{
"epoch": 0.333414574701438,
"grad_norm": 0.3096555471420288,
"learning_rate": 4.6778615490061684e-07,
"loss": 2.0649,
"step": 171
},
{
"epoch": 0.3353643675359493,
"grad_norm": 0.3024272620677948,
"learning_rate": 4.6744345442083614e-07,
"loss": 2.0508,
"step": 172
},
{
"epoch": 0.33731416037046064,
"grad_norm": 0.3325616419315338,
"learning_rate": 4.671007539410555e-07,
"loss": 2.1431,
"step": 173
},
{
"epoch": 0.339263953204972,
"grad_norm": 0.3665126860141754,
"learning_rate": 4.6675805346127483e-07,
"loss": 2.1174,
"step": 174
},
{
"epoch": 0.3412137460394833,
"grad_norm": 0.3292168378829956,
"learning_rate": 4.664153529814941e-07,
"loss": 2.1029,
"step": 175
},
{
"epoch": 0.34316353887399464,
"grad_norm": 0.3286147713661194,
"learning_rate": 4.6607265250171347e-07,
"loss": 2.1042,
"step": 176
},
{
"epoch": 0.34511333170850594,
"grad_norm": 0.32417264580726624,
"learning_rate": 4.657299520219328e-07,
"loss": 2.0901,
"step": 177
},
{
"epoch": 0.3470631245430173,
"grad_norm": 0.31667739152908325,
"learning_rate": 4.653872515421521e-07,
"loss": 2.0895,
"step": 178
},
{
"epoch": 0.34901291737752865,
"grad_norm": 0.3280418813228607,
"learning_rate": 4.6504455106237146e-07,
"loss": 2.1237,
"step": 179
},
{
"epoch": 0.35096271021203995,
"grad_norm": 0.32828444242477417,
"learning_rate": 4.647018505825908e-07,
"loss": 2.0933,
"step": 180
},
{
"epoch": 0.3529125030465513,
"grad_norm": 0.3365094065666199,
"learning_rate": 4.643591501028101e-07,
"loss": 2.1049,
"step": 181
},
{
"epoch": 0.35486229588106266,
"grad_norm": 0.3169403076171875,
"learning_rate": 4.6401644962302945e-07,
"loss": 2.0636,
"step": 182
},
{
"epoch": 0.35681208871557396,
"grad_norm": 0.31843212246894836,
"learning_rate": 4.636737491432488e-07,
"loss": 2.0744,
"step": 183
},
{
"epoch": 0.3587618815500853,
"grad_norm": 0.34016114473342896,
"learning_rate": 4.633310486634681e-07,
"loss": 2.0572,
"step": 184
},
{
"epoch": 0.3607116743845966,
"grad_norm": 0.3435775935649872,
"learning_rate": 4.6298834818368744e-07,
"loss": 2.0702,
"step": 185
},
{
"epoch": 0.36266146721910797,
"grad_norm": 0.32756081223487854,
"learning_rate": 4.6264564770390674e-07,
"loss": 2.0219,
"step": 186
},
{
"epoch": 0.3646112600536193,
"grad_norm": 0.3173263370990753,
"learning_rate": 4.623029472241261e-07,
"loss": 2.0134,
"step": 187
},
{
"epoch": 0.3665610528881306,
"grad_norm": 0.33062443137168884,
"learning_rate": 4.6196024674434543e-07,
"loss": 2.0508,
"step": 188
},
{
"epoch": 0.368510845722642,
"grad_norm": 0.3294820785522461,
"learning_rate": 4.616175462645647e-07,
"loss": 1.9935,
"step": 189
},
{
"epoch": 0.3704606385571533,
"grad_norm": 0.3417966663837433,
"learning_rate": 4.6127484578478407e-07,
"loss": 2.0486,
"step": 190
},
{
"epoch": 0.37241043139166463,
"grad_norm": 0.35238054394721985,
"learning_rate": 4.609321453050034e-07,
"loss": 2.0854,
"step": 191
},
{
"epoch": 0.374360224226176,
"grad_norm": 0.3305458426475525,
"learning_rate": 4.605894448252227e-07,
"loss": 2.0449,
"step": 192
},
{
"epoch": 0.3763100170606873,
"grad_norm": 0.324318528175354,
"learning_rate": 4.6024674434544206e-07,
"loss": 2.1153,
"step": 193
},
{
"epoch": 0.37825980989519864,
"grad_norm": 0.3373543322086334,
"learning_rate": 4.599040438656614e-07,
"loss": 2.0677,
"step": 194
},
{
"epoch": 0.38020960272971,
"grad_norm": 0.345115602016449,
"learning_rate": 4.595613433858807e-07,
"loss": 2.0312,
"step": 195
},
{
"epoch": 0.3821593955642213,
"grad_norm": 0.3340489864349365,
"learning_rate": 4.5921864290610005e-07,
"loss": 1.9848,
"step": 196
},
{
"epoch": 0.38410918839873265,
"grad_norm": 0.3615861237049103,
"learning_rate": 4.588759424263194e-07,
"loss": 2.0471,
"step": 197
},
{
"epoch": 0.38605898123324395,
"grad_norm": 0.3380940854549408,
"learning_rate": 4.585332419465387e-07,
"loss": 2.0481,
"step": 198
},
{
"epoch": 0.3880087740677553,
"grad_norm": 0.3478194773197174,
"learning_rate": 4.58190541466758e-07,
"loss": 2.0324,
"step": 199
},
{
"epoch": 0.38995856690226666,
"grad_norm": 0.34738266468048096,
"learning_rate": 4.578478409869774e-07,
"loss": 2.0864,
"step": 200
},
{
"epoch": 0.39190835973677796,
"grad_norm": 0.3694723844528198,
"learning_rate": 4.575051405071967e-07,
"loss": 2.1574,
"step": 201
},
{
"epoch": 0.3938581525712893,
"grad_norm": 0.3413209617137909,
"learning_rate": 4.57162440027416e-07,
"loss": 2.067,
"step": 202
},
{
"epoch": 0.3958079454058006,
"grad_norm": 0.3256085515022278,
"learning_rate": 4.568197395476354e-07,
"loss": 2.0749,
"step": 203
},
{
"epoch": 0.39775773824031196,
"grad_norm": 0.3281763792037964,
"learning_rate": 4.5647703906785467e-07,
"loss": 2.0431,
"step": 204
},
{
"epoch": 0.3997075310748233,
"grad_norm": 0.3446051776409149,
"learning_rate": 4.5613433858807397e-07,
"loss": 2.011,
"step": 205
},
{
"epoch": 0.4016573239093346,
"grad_norm": 0.3425387442111969,
"learning_rate": 4.5579163810829337e-07,
"loss": 2.0987,
"step": 206
},
{
"epoch": 0.403607116743846,
"grad_norm": 0.33923473954200745,
"learning_rate": 4.5544893762851266e-07,
"loss": 2.0777,
"step": 207
},
{
"epoch": 0.40555690957835727,
"grad_norm": 0.34710973501205444,
"learning_rate": 4.5510623714873196e-07,
"loss": 2.0662,
"step": 208
},
{
"epoch": 0.4075067024128686,
"grad_norm": 0.33852049708366394,
"learning_rate": 4.5476353666895136e-07,
"loss": 2.0872,
"step": 209
},
{
"epoch": 0.40945649524738,
"grad_norm": 0.342153400182724,
"learning_rate": 4.5442083618917065e-07,
"loss": 2.0414,
"step": 210
},
{
"epoch": 0.4114062880818913,
"grad_norm": 0.34867721796035767,
"learning_rate": 4.5407813570938995e-07,
"loss": 2.1128,
"step": 211
},
{
"epoch": 0.41335608091640264,
"grad_norm": 0.33942094445228577,
"learning_rate": 4.537354352296093e-07,
"loss": 2.0786,
"step": 212
},
{
"epoch": 0.415305873750914,
"grad_norm": 0.33538249135017395,
"learning_rate": 4.5339273474982864e-07,
"loss": 2.0332,
"step": 213
},
{
"epoch": 0.4172556665854253,
"grad_norm": 0.34453144669532776,
"learning_rate": 4.5305003427004794e-07,
"loss": 2.0629,
"step": 214
},
{
"epoch": 0.41920545941993664,
"grad_norm": 0.35166001319885254,
"learning_rate": 4.527073337902673e-07,
"loss": 2.0881,
"step": 215
},
{
"epoch": 0.42115525225444794,
"grad_norm": 0.3170466721057892,
"learning_rate": 4.5236463331048663e-07,
"loss": 2.0508,
"step": 216
},
{
"epoch": 0.4231050450889593,
"grad_norm": 0.3201327919960022,
"learning_rate": 4.520219328307059e-07,
"loss": 2.0147,
"step": 217
},
{
"epoch": 0.42505483792347065,
"grad_norm": 0.34361732006073,
"learning_rate": 4.5167923235092527e-07,
"loss": 2.084,
"step": 218
},
{
"epoch": 0.42700463075798195,
"grad_norm": 0.3500427305698395,
"learning_rate": 4.513365318711446e-07,
"loss": 2.0568,
"step": 219
},
{
"epoch": 0.4289544235924933,
"grad_norm": 0.34151604771614075,
"learning_rate": 4.509938313913639e-07,
"loss": 2.0366,
"step": 220
},
{
"epoch": 0.4309042164270046,
"grad_norm": 0.3297358751296997,
"learning_rate": 4.5065113091158326e-07,
"loss": 2.0639,
"step": 221
},
{
"epoch": 0.43285400926151596,
"grad_norm": 0.3623073995113373,
"learning_rate": 4.503084304318026e-07,
"loss": 2.0477,
"step": 222
},
{
"epoch": 0.4348038020960273,
"grad_norm": 0.34618520736694336,
"learning_rate": 4.499657299520219e-07,
"loss": 2.1036,
"step": 223
},
{
"epoch": 0.4367535949305386,
"grad_norm": 0.3289443850517273,
"learning_rate": 4.4962302947224125e-07,
"loss": 2.0026,
"step": 224
},
{
"epoch": 0.43870338776504997,
"grad_norm": 0.3390786349773407,
"learning_rate": 4.4928032899246055e-07,
"loss": 2.0208,
"step": 225
},
{
"epoch": 0.4406531805995613,
"grad_norm": 0.3597511351108551,
"learning_rate": 4.489376285126799e-07,
"loss": 2.1259,
"step": 226
},
{
"epoch": 0.4426029734340726,
"grad_norm": 0.3647196888923645,
"learning_rate": 4.4859492803289924e-07,
"loss": 2.1048,
"step": 227
},
{
"epoch": 0.444552766268584,
"grad_norm": 0.35180747509002686,
"learning_rate": 4.4825222755311854e-07,
"loss": 2.0439,
"step": 228
},
{
"epoch": 0.4465025591030953,
"grad_norm": 0.35504230856895447,
"learning_rate": 4.479095270733379e-07,
"loss": 2.0845,
"step": 229
},
{
"epoch": 0.44845235193760663,
"grad_norm": 0.3500707447528839,
"learning_rate": 4.4756682659355723e-07,
"loss": 2.0717,
"step": 230
},
{
"epoch": 0.450402144772118,
"grad_norm": 0.34788116812705994,
"learning_rate": 4.472241261137765e-07,
"loss": 2.1076,
"step": 231
},
{
"epoch": 0.4523519376066293,
"grad_norm": 0.3553301990032196,
"learning_rate": 4.4688142563399587e-07,
"loss": 2.0512,
"step": 232
},
{
"epoch": 0.45430173044114064,
"grad_norm": 0.3606579005718231,
"learning_rate": 4.465387251542152e-07,
"loss": 2.1154,
"step": 233
},
{
"epoch": 0.45625152327565194,
"grad_norm": 0.3678739368915558,
"learning_rate": 4.461960246744345e-07,
"loss": 2.0755,
"step": 234
},
{
"epoch": 0.4582013161101633,
"grad_norm": 0.3320152461528778,
"learning_rate": 4.4585332419465386e-07,
"loss": 2.0402,
"step": 235
},
{
"epoch": 0.46015110894467465,
"grad_norm": 0.3439280688762665,
"learning_rate": 4.455106237148732e-07,
"loss": 2.0674,
"step": 236
},
{
"epoch": 0.46210090177918595,
"grad_norm": 0.34789469838142395,
"learning_rate": 4.451679232350925e-07,
"loss": 2.0616,
"step": 237
},
{
"epoch": 0.4640506946136973,
"grad_norm": 0.35700955986976624,
"learning_rate": 4.448252227553118e-07,
"loss": 2.0678,
"step": 238
},
{
"epoch": 0.4660004874482086,
"grad_norm": 0.33981651067733765,
"learning_rate": 4.444825222755312e-07,
"loss": 2.0552,
"step": 239
},
{
"epoch": 0.46795028028271995,
"grad_norm": 0.36125004291534424,
"learning_rate": 4.441398217957505e-07,
"loss": 2.0739,
"step": 240
},
{
"epoch": 0.4699000731172313,
"grad_norm": 0.3675917088985443,
"learning_rate": 4.437971213159698e-07,
"loss": 2.0341,
"step": 241
},
{
"epoch": 0.4718498659517426,
"grad_norm": 0.36773043870925903,
"learning_rate": 4.434544208361892e-07,
"loss": 2.1091,
"step": 242
},
{
"epoch": 0.47379965878625396,
"grad_norm": 0.34321659803390503,
"learning_rate": 4.431117203564085e-07,
"loss": 2.0189,
"step": 243
},
{
"epoch": 0.4757494516207653,
"grad_norm": 0.36672836542129517,
"learning_rate": 4.427690198766278e-07,
"loss": 2.064,
"step": 244
},
{
"epoch": 0.4776992444552766,
"grad_norm": 0.3681386411190033,
"learning_rate": 4.424263193968472e-07,
"loss": 2.0895,
"step": 245
},
{
"epoch": 0.47964903728978797,
"grad_norm": 0.36538165807724,
"learning_rate": 4.4208361891706647e-07,
"loss": 2.0361,
"step": 246
},
{
"epoch": 0.48159883012429927,
"grad_norm": 0.3780750036239624,
"learning_rate": 4.4174091843728577e-07,
"loss": 2.053,
"step": 247
},
{
"epoch": 0.4835486229588106,
"grad_norm": 0.3471691310405731,
"learning_rate": 4.4139821795750517e-07,
"loss": 2.0051,
"step": 248
},
{
"epoch": 0.485498415793322,
"grad_norm": 0.36653193831443787,
"learning_rate": 4.4105551747772446e-07,
"loss": 2.1492,
"step": 249
},
{
"epoch": 0.4874482086278333,
"grad_norm": 0.37775489687919617,
"learning_rate": 4.4071281699794376e-07,
"loss": 2.0406,
"step": 250
},
{
"epoch": 0.48939800146234463,
"grad_norm": 0.3678765892982483,
"learning_rate": 4.403701165181631e-07,
"loss": 2.0804,
"step": 251
},
{
"epoch": 0.49134779429685593,
"grad_norm": 0.3415094316005707,
"learning_rate": 4.4002741603838245e-07,
"loss": 2.0187,
"step": 252
},
{
"epoch": 0.4932975871313673,
"grad_norm": 0.3463176190853119,
"learning_rate": 4.3968471555860175e-07,
"loss": 2.0618,
"step": 253
},
{
"epoch": 0.49524737996587864,
"grad_norm": 0.3565087616443634,
"learning_rate": 4.393420150788211e-07,
"loss": 2.0809,
"step": 254
},
{
"epoch": 0.49719717280038994,
"grad_norm": 0.3863977789878845,
"learning_rate": 4.3899931459904044e-07,
"loss": 2.038,
"step": 255
},
{
"epoch": 0.4991469656349013,
"grad_norm": 0.3344396948814392,
"learning_rate": 4.3865661411925974e-07,
"loss": 2.071,
"step": 256
},
{
"epoch": 0.5010967584694126,
"grad_norm": 0.3676479160785675,
"learning_rate": 4.383139136394791e-07,
"loss": 2.0469,
"step": 257
},
{
"epoch": 0.503046551303924,
"grad_norm": 0.36381298303604126,
"learning_rate": 4.3797121315969843e-07,
"loss": 2.0795,
"step": 258
},
{
"epoch": 0.5049963441384353,
"grad_norm": 0.3515491783618927,
"learning_rate": 4.376285126799177e-07,
"loss": 1.9912,
"step": 259
},
{
"epoch": 0.5069461369729467,
"grad_norm": 0.3699260354042053,
"learning_rate": 4.3728581220013707e-07,
"loss": 2.0829,
"step": 260
},
{
"epoch": 0.5088959298074579,
"grad_norm": 0.39030641317367554,
"learning_rate": 4.3694311172035637e-07,
"loss": 2.0917,
"step": 261
},
{
"epoch": 0.5108457226419693,
"grad_norm": 0.35085543990135193,
"learning_rate": 4.366004112405757e-07,
"loss": 2.0517,
"step": 262
},
{
"epoch": 0.5127955154764806,
"grad_norm": 0.3542785048484802,
"learning_rate": 4.3625771076079506e-07,
"loss": 2.0519,
"step": 263
},
{
"epoch": 0.514745308310992,
"grad_norm": 0.36474236845970154,
"learning_rate": 4.3591501028101436e-07,
"loss": 1.9739,
"step": 264
},
{
"epoch": 0.5166951011455033,
"grad_norm": 0.37260621786117554,
"learning_rate": 4.355723098012337e-07,
"loss": 1.9897,
"step": 265
},
{
"epoch": 0.5186448939800147,
"grad_norm": 0.3556238114833832,
"learning_rate": 4.3522960932145305e-07,
"loss": 2.0196,
"step": 266
},
{
"epoch": 0.5205946868145259,
"grad_norm": 0.36310216784477234,
"learning_rate": 4.3488690884167235e-07,
"loss": 2.0151,
"step": 267
},
{
"epoch": 0.5225444796490373,
"grad_norm": 0.37483158707618713,
"learning_rate": 4.345442083618917e-07,
"loss": 2.0929,
"step": 268
},
{
"epoch": 0.5244942724835486,
"grad_norm": 0.3717723786830902,
"learning_rate": 4.3420150788211104e-07,
"loss": 2.1377,
"step": 269
},
{
"epoch": 0.52644406531806,
"grad_norm": 0.34406736493110657,
"learning_rate": 4.3385880740233034e-07,
"loss": 2.0109,
"step": 270
},
{
"epoch": 0.5283938581525713,
"grad_norm": 0.37034499645233154,
"learning_rate": 4.335161069225497e-07,
"loss": 2.0867,
"step": 271
},
{
"epoch": 0.5303436509870826,
"grad_norm": 0.3672201931476593,
"learning_rate": 4.3317340644276903e-07,
"loss": 2.0828,
"step": 272
},
{
"epoch": 0.5322934438215939,
"grad_norm": 0.3954712152481079,
"learning_rate": 4.328307059629883e-07,
"loss": 2.0625,
"step": 273
},
{
"epoch": 0.5342432366561053,
"grad_norm": 0.35529398918151855,
"learning_rate": 4.324880054832076e-07,
"loss": 2.1149,
"step": 274
},
{
"epoch": 0.5361930294906166,
"grad_norm": 0.34687867760658264,
"learning_rate": 4.32145305003427e-07,
"loss": 2.0161,
"step": 275
},
{
"epoch": 0.538142822325128,
"grad_norm": 0.3687521815299988,
"learning_rate": 4.318026045236463e-07,
"loss": 2.0979,
"step": 276
},
{
"epoch": 0.5400926151596392,
"grad_norm": 0.36186617612838745,
"learning_rate": 4.314599040438656e-07,
"loss": 2.0323,
"step": 277
},
{
"epoch": 0.5420424079941506,
"grad_norm": 0.34530189633369446,
"learning_rate": 4.31117203564085e-07,
"loss": 2.0252,
"step": 278
},
{
"epoch": 0.543992200828662,
"grad_norm": 0.36403632164001465,
"learning_rate": 4.307745030843043e-07,
"loss": 2.0518,
"step": 279
},
{
"epoch": 0.5459419936631733,
"grad_norm": 0.4035261869430542,
"learning_rate": 4.304318026045236e-07,
"loss": 2.1648,
"step": 280
},
{
"epoch": 0.5478917864976847,
"grad_norm": 0.36672019958496094,
"learning_rate": 4.30089102124743e-07,
"loss": 2.0564,
"step": 281
},
{
"epoch": 0.5498415793321959,
"grad_norm": 0.386877179145813,
"learning_rate": 4.297464016449623e-07,
"loss": 2.0859,
"step": 282
},
{
"epoch": 0.5517913721667073,
"grad_norm": 0.38155534863471985,
"learning_rate": 4.294037011651816e-07,
"loss": 2.0828,
"step": 283
},
{
"epoch": 0.5537411650012186,
"grad_norm": 0.3724847435951233,
"learning_rate": 4.29061000685401e-07,
"loss": 2.167,
"step": 284
},
{
"epoch": 0.55569095783573,
"grad_norm": 0.3812715411186218,
"learning_rate": 4.287183002056203e-07,
"loss": 2.0624,
"step": 285
},
{
"epoch": 0.5576407506702413,
"grad_norm": 0.365509569644928,
"learning_rate": 4.283755997258396e-07,
"loss": 2.0324,
"step": 286
},
{
"epoch": 0.5595905435047526,
"grad_norm": 0.3624550700187683,
"learning_rate": 4.2803289924605887e-07,
"loss": 2.0274,
"step": 287
},
{
"epoch": 0.5615403363392639,
"grad_norm": 0.38429534435272217,
"learning_rate": 4.2769019876627827e-07,
"loss": 2.0031,
"step": 288
},
{
"epoch": 0.5634901291737753,
"grad_norm": 0.3589562773704529,
"learning_rate": 4.2734749828649757e-07,
"loss": 2.0547,
"step": 289
},
{
"epoch": 0.5654399220082866,
"grad_norm": 0.3625582158565521,
"learning_rate": 4.2700479780671686e-07,
"loss": 2.0044,
"step": 290
},
{
"epoch": 0.567389714842798,
"grad_norm": 0.37126410007476807,
"learning_rate": 4.2666209732693626e-07,
"loss": 2.0788,
"step": 291
},
{
"epoch": 0.5693395076773092,
"grad_norm": 0.36942729353904724,
"learning_rate": 4.2631939684715556e-07,
"loss": 2.0676,
"step": 292
},
{
"epoch": 0.5712893005118206,
"grad_norm": 0.3787277936935425,
"learning_rate": 4.2597669636737485e-07,
"loss": 2.0491,
"step": 293
},
{
"epoch": 0.5732390933463319,
"grad_norm": 0.3843463957309723,
"learning_rate": 4.2563399588759425e-07,
"loss": 2.0657,
"step": 294
},
{
"epoch": 0.5751888861808433,
"grad_norm": 0.384347140789032,
"learning_rate": 4.2529129540781355e-07,
"loss": 2.042,
"step": 295
},
{
"epoch": 0.5771386790153546,
"grad_norm": 0.38822734355926514,
"learning_rate": 4.2494859492803284e-07,
"loss": 2.1084,
"step": 296
},
{
"epoch": 0.579088471849866,
"grad_norm": 0.3850004971027374,
"learning_rate": 4.2460589444825224e-07,
"loss": 2.0527,
"step": 297
},
{
"epoch": 0.5810382646843772,
"grad_norm": 0.3730074167251587,
"learning_rate": 4.2426319396847154e-07,
"loss": 2.0665,
"step": 298
},
{
"epoch": 0.5829880575188886,
"grad_norm": 0.3895587623119354,
"learning_rate": 4.2392049348869083e-07,
"loss": 2.1166,
"step": 299
},
{
"epoch": 0.5849378503534,
"grad_norm": 0.3875929117202759,
"learning_rate": 4.235777930089102e-07,
"loss": 2.1165,
"step": 300
},
{
"epoch": 0.5868876431879113,
"grad_norm": 0.36664247512817383,
"learning_rate": 4.232350925291295e-07,
"loss": 2.0039,
"step": 301
},
{
"epoch": 0.5888374360224227,
"grad_norm": 0.3771498501300812,
"learning_rate": 4.228923920493488e-07,
"loss": 2.0727,
"step": 302
},
{
"epoch": 0.5907872288569339,
"grad_norm": 0.3995096981525421,
"learning_rate": 4.2254969156956817e-07,
"loss": 2.0836,
"step": 303
},
{
"epoch": 0.5927370216914453,
"grad_norm": 0.3781261444091797,
"learning_rate": 4.222069910897875e-07,
"loss": 2.0797,
"step": 304
},
{
"epoch": 0.5946868145259566,
"grad_norm": 0.37572017312049866,
"learning_rate": 4.218642906100068e-07,
"loss": 2.0363,
"step": 305
},
{
"epoch": 0.596636607360468,
"grad_norm": 0.38773536682128906,
"learning_rate": 4.2152159013022616e-07,
"loss": 2.0423,
"step": 306
},
{
"epoch": 0.5985864001949793,
"grad_norm": 0.37952083349227905,
"learning_rate": 4.211788896504455e-07,
"loss": 2.0966,
"step": 307
},
{
"epoch": 0.6005361930294906,
"grad_norm": 0.39403635263442993,
"learning_rate": 4.208361891706648e-07,
"loss": 2.1212,
"step": 308
},
{
"epoch": 0.6024859858640019,
"grad_norm": 0.382625013589859,
"learning_rate": 4.2049348869088415e-07,
"loss": 2.0363,
"step": 309
},
{
"epoch": 0.6044357786985133,
"grad_norm": 0.3843761682510376,
"learning_rate": 4.201507882111035e-07,
"loss": 1.9995,
"step": 310
},
{
"epoch": 0.6063855715330246,
"grad_norm": 0.4082648754119873,
"learning_rate": 4.198080877313228e-07,
"loss": 2.1265,
"step": 311
},
{
"epoch": 0.608335364367536,
"grad_norm": 0.3746339678764343,
"learning_rate": 4.1946538725154214e-07,
"loss": 2.0025,
"step": 312
},
{
"epoch": 0.6102851572020472,
"grad_norm": 0.38548338413238525,
"learning_rate": 4.1912268677176143e-07,
"loss": 2.0764,
"step": 313
},
{
"epoch": 0.6122349500365586,
"grad_norm": 0.3702864944934845,
"learning_rate": 4.187799862919808e-07,
"loss": 2.0788,
"step": 314
},
{
"epoch": 0.6141847428710699,
"grad_norm": 0.3946288824081421,
"learning_rate": 4.184372858122001e-07,
"loss": 2.0877,
"step": 315
},
{
"epoch": 0.6161345357055813,
"grad_norm": 0.3777286410331726,
"learning_rate": 4.180945853324194e-07,
"loss": 1.9863,
"step": 316
},
{
"epoch": 0.6180843285400927,
"grad_norm": 0.40816164016723633,
"learning_rate": 4.1775188485263877e-07,
"loss": 2.0987,
"step": 317
},
{
"epoch": 0.6200341213746039,
"grad_norm": 0.39065074920654297,
"learning_rate": 4.174091843728581e-07,
"loss": 2.0629,
"step": 318
},
{
"epoch": 0.6219839142091153,
"grad_norm": 0.38007447123527527,
"learning_rate": 4.170664838930774e-07,
"loss": 2.0544,
"step": 319
},
{
"epoch": 0.6239337070436266,
"grad_norm": 0.3953652083873749,
"learning_rate": 4.1672378341329676e-07,
"loss": 2.07,
"step": 320
},
{
"epoch": 0.625883499878138,
"grad_norm": 0.38142332434654236,
"learning_rate": 4.163810829335161e-07,
"loss": 2.0495,
"step": 321
},
{
"epoch": 0.6278332927126493,
"grad_norm": 0.40484854578971863,
"learning_rate": 4.160383824537354e-07,
"loss": 2.0341,
"step": 322
},
{
"epoch": 0.6297830855471606,
"grad_norm": 0.4031660556793213,
"learning_rate": 4.1569568197395475e-07,
"loss": 2.0168,
"step": 323
},
{
"epoch": 0.6317328783816719,
"grad_norm": 0.3859906792640686,
"learning_rate": 4.153529814941741e-07,
"loss": 2.051,
"step": 324
},
{
"epoch": 0.6336826712161833,
"grad_norm": 0.37458735704421997,
"learning_rate": 4.150102810143934e-07,
"loss": 2.038,
"step": 325
},
{
"epoch": 0.6356324640506946,
"grad_norm": 0.39573705196380615,
"learning_rate": 4.146675805346127e-07,
"loss": 2.0308,
"step": 326
},
{
"epoch": 0.637582256885206,
"grad_norm": 0.39273601770401,
"learning_rate": 4.143248800548321e-07,
"loss": 2.0746,
"step": 327
},
{
"epoch": 0.6395320497197173,
"grad_norm": 0.39438948035240173,
"learning_rate": 4.139821795750514e-07,
"loss": 2.0568,
"step": 328
},
{
"epoch": 0.6414818425542286,
"grad_norm": 0.3938084840774536,
"learning_rate": 4.1363947909527067e-07,
"loss": 2.0643,
"step": 329
},
{
"epoch": 0.6434316353887399,
"grad_norm": 0.4020846486091614,
"learning_rate": 4.1329677861549007e-07,
"loss": 2.0737,
"step": 330
},
{
"epoch": 0.6453814282232513,
"grad_norm": 0.413841187953949,
"learning_rate": 4.1295407813570937e-07,
"loss": 2.019,
"step": 331
},
{
"epoch": 0.6473312210577626,
"grad_norm": 0.39189133048057556,
"learning_rate": 4.1261137765592866e-07,
"loss": 2.0795,
"step": 332
},
{
"epoch": 0.649281013892274,
"grad_norm": 0.4119293987751007,
"learning_rate": 4.1226867717614806e-07,
"loss": 2.0794,
"step": 333
},
{
"epoch": 0.6512308067267852,
"grad_norm": 0.40321430563926697,
"learning_rate": 4.1192597669636736e-07,
"loss": 2.0249,
"step": 334
},
{
"epoch": 0.6531805995612966,
"grad_norm": 0.39300522208213806,
"learning_rate": 4.1158327621658665e-07,
"loss": 2.0667,
"step": 335
},
{
"epoch": 0.655130392395808,
"grad_norm": 0.39836639165878296,
"learning_rate": 4.1124057573680605e-07,
"loss": 2.037,
"step": 336
},
{
"epoch": 0.6570801852303193,
"grad_norm": 0.41594526171684265,
"learning_rate": 4.1089787525702535e-07,
"loss": 2.0795,
"step": 337
},
{
"epoch": 0.6590299780648307,
"grad_norm": 0.3934768736362457,
"learning_rate": 4.1055517477724464e-07,
"loss": 2.0045,
"step": 338
},
{
"epoch": 0.6609797708993419,
"grad_norm": 0.3954453766345978,
"learning_rate": 4.10212474297464e-07,
"loss": 2.0005,
"step": 339
},
{
"epoch": 0.6629295637338533,
"grad_norm": 0.42002055048942566,
"learning_rate": 4.0986977381768334e-07,
"loss": 2.0781,
"step": 340
},
{
"epoch": 0.6648793565683646,
"grad_norm": 0.3964640200138092,
"learning_rate": 4.0952707333790263e-07,
"loss": 2.1088,
"step": 341
},
{
"epoch": 0.666829149402876,
"grad_norm": 0.3742097318172455,
"learning_rate": 4.09184372858122e-07,
"loss": 2.0201,
"step": 342
},
{
"epoch": 0.6687789422373873,
"grad_norm": 0.40264692902565,
"learning_rate": 4.088416723783413e-07,
"loss": 2.0927,
"step": 343
},
{
"epoch": 0.6707287350718986,
"grad_norm": 0.39995405077934265,
"learning_rate": 4.084989718985606e-07,
"loss": 2.0783,
"step": 344
},
{
"epoch": 0.6726785279064099,
"grad_norm": 0.39974457025527954,
"learning_rate": 4.0815627141877997e-07,
"loss": 2.0613,
"step": 345
},
{
"epoch": 0.6746283207409213,
"grad_norm": 0.39440110325813293,
"learning_rate": 4.078135709389993e-07,
"loss": 2.0963,
"step": 346
},
{
"epoch": 0.6765781135754326,
"grad_norm": 0.40362536907196045,
"learning_rate": 4.074708704592186e-07,
"loss": 2.138,
"step": 347
},
{
"epoch": 0.678527906409944,
"grad_norm": 0.4271102547645569,
"learning_rate": 4.0712816997943796e-07,
"loss": 2.0668,
"step": 348
},
{
"epoch": 0.6804776992444552,
"grad_norm": 0.3873864412307739,
"learning_rate": 4.067854694996573e-07,
"loss": 2.0236,
"step": 349
},
{
"epoch": 0.6824274920789666,
"grad_norm": 0.39676573872566223,
"learning_rate": 4.064427690198766e-07,
"loss": 2.0723,
"step": 350
},
{
"epoch": 0.6843772849134779,
"grad_norm": 0.3926120102405548,
"learning_rate": 4.0610006854009595e-07,
"loss": 2.0193,
"step": 351
},
{
"epoch": 0.6863270777479893,
"grad_norm": 0.3857557773590088,
"learning_rate": 4.0575736806031524e-07,
"loss": 2.0574,
"step": 352
},
{
"epoch": 0.6882768705825006,
"grad_norm": 0.4042007327079773,
"learning_rate": 4.054146675805346e-07,
"loss": 2.0196,
"step": 353
},
{
"epoch": 0.6902266634170119,
"grad_norm": 0.3976573944091797,
"learning_rate": 4.0507196710075394e-07,
"loss": 1.9201,
"step": 354
},
{
"epoch": 0.6921764562515232,
"grad_norm": 0.38179242610931396,
"learning_rate": 4.0472926662097323e-07,
"loss": 2.0551,
"step": 355
},
{
"epoch": 0.6941262490860346,
"grad_norm": 0.4144536256790161,
"learning_rate": 4.043865661411926e-07,
"loss": 2.0633,
"step": 356
},
{
"epoch": 0.696076041920546,
"grad_norm": 0.42070674896240234,
"learning_rate": 4.040438656614119e-07,
"loss": 2.1222,
"step": 357
},
{
"epoch": 0.6980258347550573,
"grad_norm": 0.394010066986084,
"learning_rate": 4.037011651816312e-07,
"loss": 2.0497,
"step": 358
},
{
"epoch": 0.6999756275895687,
"grad_norm": 0.40751656889915466,
"learning_rate": 4.0335846470185057e-07,
"loss": 2.0554,
"step": 359
},
{
"epoch": 0.7019254204240799,
"grad_norm": 0.3723933696746826,
"learning_rate": 4.030157642220699e-07,
"loss": 1.9727,
"step": 360
},
{
"epoch": 0.7038752132585913,
"grad_norm": 0.3941795825958252,
"learning_rate": 4.026730637422892e-07,
"loss": 2.0793,
"step": 361
},
{
"epoch": 0.7058250060931026,
"grad_norm": 0.3988247513771057,
"learning_rate": 4.0233036326250856e-07,
"loss": 2.1244,
"step": 362
},
{
"epoch": 0.707774798927614,
"grad_norm": 0.409525603055954,
"learning_rate": 4.019876627827279e-07,
"loss": 2.0778,
"step": 363
},
{
"epoch": 0.7097245917621253,
"grad_norm": 0.37638112902641296,
"learning_rate": 4.016449623029472e-07,
"loss": 1.9827,
"step": 364
},
{
"epoch": 0.7116743845966366,
"grad_norm": 0.41931676864624023,
"learning_rate": 4.013022618231665e-07,
"loss": 2.0805,
"step": 365
},
{
"epoch": 0.7136241774311479,
"grad_norm": 0.391668438911438,
"learning_rate": 4.009595613433859e-07,
"loss": 2.0695,
"step": 366
},
{
"epoch": 0.7155739702656593,
"grad_norm": 0.4082440733909607,
"learning_rate": 4.006168608636052e-07,
"loss": 2.0232,
"step": 367
},
{
"epoch": 0.7175237631001706,
"grad_norm": 0.41394224762916565,
"learning_rate": 4.002741603838245e-07,
"loss": 2.024,
"step": 368
},
{
"epoch": 0.719473555934682,
"grad_norm": 0.41648924350738525,
"learning_rate": 3.999314599040439e-07,
"loss": 2.0108,
"step": 369
},
{
"epoch": 0.7214233487691932,
"grad_norm": 0.408218652009964,
"learning_rate": 3.995887594242632e-07,
"loss": 2.0712,
"step": 370
},
{
"epoch": 0.7233731416037046,
"grad_norm": 0.39029547572135925,
"learning_rate": 3.9924605894448247e-07,
"loss": 2.0475,
"step": 371
},
{
"epoch": 0.7253229344382159,
"grad_norm": 0.4242095649242401,
"learning_rate": 3.9890335846470187e-07,
"loss": 2.0507,
"step": 372
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.3876708745956421,
"learning_rate": 3.9856065798492117e-07,
"loss": 2.0161,
"step": 373
},
{
"epoch": 0.7292225201072386,
"grad_norm": 0.41027507185935974,
"learning_rate": 3.9821795750514046e-07,
"loss": 2.0544,
"step": 374
},
{
"epoch": 0.7311723129417499,
"grad_norm": 0.4173310697078705,
"learning_rate": 3.9787525702535986e-07,
"loss": 2.0615,
"step": 375
},
{
"epoch": 0.7331221057762612,
"grad_norm": 0.40106937289237976,
"learning_rate": 3.9753255654557916e-07,
"loss": 2.0189,
"step": 376
},
{
"epoch": 0.7350718986107726,
"grad_norm": 0.40185120701789856,
"learning_rate": 3.9718985606579845e-07,
"loss": 2.0624,
"step": 377
},
{
"epoch": 0.737021691445284,
"grad_norm": 0.39821675419807434,
"learning_rate": 3.968471555860178e-07,
"loss": 2.0664,
"step": 378
},
{
"epoch": 0.7389714842797953,
"grad_norm": 0.4365295171737671,
"learning_rate": 3.9650445510623715e-07,
"loss": 2.065,
"step": 379
},
{
"epoch": 0.7409212771143066,
"grad_norm": 0.40240806341171265,
"learning_rate": 3.9616175462645644e-07,
"loss": 2.0526,
"step": 380
},
{
"epoch": 0.7428710699488179,
"grad_norm": 0.4148831069469452,
"learning_rate": 3.958190541466758e-07,
"loss": 2.1255,
"step": 381
},
{
"epoch": 0.7448208627833293,
"grad_norm": 0.4301227033138275,
"learning_rate": 3.9547635366689514e-07,
"loss": 2.0715,
"step": 382
},
{
"epoch": 0.7467706556178406,
"grad_norm": 0.42958423495292664,
"learning_rate": 3.9513365318711443e-07,
"loss": 2.0762,
"step": 383
},
{
"epoch": 0.748720448452352,
"grad_norm": 0.40311166644096375,
"learning_rate": 3.947909527073338e-07,
"loss": 2.0102,
"step": 384
},
{
"epoch": 0.7506702412868632,
"grad_norm": 0.41303250193595886,
"learning_rate": 3.944482522275531e-07,
"loss": 2.0435,
"step": 385
},
{
"epoch": 0.7526200341213746,
"grad_norm": 0.4167964458465576,
"learning_rate": 3.941055517477724e-07,
"loss": 2.0648,
"step": 386
},
{
"epoch": 0.7545698269558859,
"grad_norm": 0.39250755310058594,
"learning_rate": 3.9376285126799177e-07,
"loss": 2.032,
"step": 387
},
{
"epoch": 0.7565196197903973,
"grad_norm": 0.41534167528152466,
"learning_rate": 3.9342015078821106e-07,
"loss": 2.023,
"step": 388
},
{
"epoch": 0.7584694126249086,
"grad_norm": 0.4158441424369812,
"learning_rate": 3.930774503084304e-07,
"loss": 2.1015,
"step": 389
},
{
"epoch": 0.76041920545942,
"grad_norm": 0.39154303073883057,
"learning_rate": 3.9273474982864976e-07,
"loss": 2.0166,
"step": 390
},
{
"epoch": 0.7623689982939312,
"grad_norm": 0.3865329325199127,
"learning_rate": 3.9239204934886905e-07,
"loss": 2.0209,
"step": 391
},
{
"epoch": 0.7643187911284426,
"grad_norm": 0.4046148955821991,
"learning_rate": 3.920493488690884e-07,
"loss": 2.0501,
"step": 392
},
{
"epoch": 0.7662685839629539,
"grad_norm": 0.4096246659755707,
"learning_rate": 3.9170664838930775e-07,
"loss": 2.0377,
"step": 393
},
{
"epoch": 0.7682183767974653,
"grad_norm": 0.40363749861717224,
"learning_rate": 3.9136394790952704e-07,
"loss": 2.0315,
"step": 394
},
{
"epoch": 0.7701681696319767,
"grad_norm": 0.4038202166557312,
"learning_rate": 3.910212474297464e-07,
"loss": 1.9516,
"step": 395
},
{
"epoch": 0.7721179624664879,
"grad_norm": 0.3979615271091461,
"learning_rate": 3.9067854694996574e-07,
"loss": 2.02,
"step": 396
},
{
"epoch": 0.7740677553009992,
"grad_norm": 0.4166601896286011,
"learning_rate": 3.9033584647018503e-07,
"loss": 2.0672,
"step": 397
},
{
"epoch": 0.7760175481355106,
"grad_norm": 0.4038446545600891,
"learning_rate": 3.899931459904044e-07,
"loss": 2.0183,
"step": 398
},
{
"epoch": 0.777967340970022,
"grad_norm": 0.4230453670024872,
"learning_rate": 3.896504455106237e-07,
"loss": 2.0234,
"step": 399
},
{
"epoch": 0.7799171338045333,
"grad_norm": 0.4244215190410614,
"learning_rate": 3.89307745030843e-07,
"loss": 2.0863,
"step": 400
},
{
"epoch": 0.7818669266390446,
"grad_norm": 0.42174607515335083,
"learning_rate": 3.889650445510623e-07,
"loss": 2.0775,
"step": 401
},
{
"epoch": 0.7838167194735559,
"grad_norm": 0.4019846022129059,
"learning_rate": 3.886223440712817e-07,
"loss": 2.0445,
"step": 402
},
{
"epoch": 0.7857665123080673,
"grad_norm": 0.4168083965778351,
"learning_rate": 3.88279643591501e-07,
"loss": 2.0457,
"step": 403
},
{
"epoch": 0.7877163051425786,
"grad_norm": 0.4132064878940582,
"learning_rate": 3.879369431117203e-07,
"loss": 2.0637,
"step": 404
},
{
"epoch": 0.78966609797709,
"grad_norm": 0.4239768981933594,
"learning_rate": 3.875942426319397e-07,
"loss": 2.0512,
"step": 405
},
{
"epoch": 0.7916158908116012,
"grad_norm": 0.4192203879356384,
"learning_rate": 3.87251542152159e-07,
"loss": 2.0766,
"step": 406
},
{
"epoch": 0.7935656836461126,
"grad_norm": 0.4393591582775116,
"learning_rate": 3.869088416723783e-07,
"loss": 2.0497,
"step": 407
},
{
"epoch": 0.7955154764806239,
"grad_norm": 0.417614221572876,
"learning_rate": 3.865661411925977e-07,
"loss": 2.0518,
"step": 408
},
{
"epoch": 0.7974652693151353,
"grad_norm": 0.4034237563610077,
"learning_rate": 3.86223440712817e-07,
"loss": 2.0604,
"step": 409
},
{
"epoch": 0.7994150621496466,
"grad_norm": 0.4287107586860657,
"learning_rate": 3.858807402330363e-07,
"loss": 2.0386,
"step": 410
},
{
"epoch": 0.8013648549841579,
"grad_norm": 0.4140661656856537,
"learning_rate": 3.855380397532557e-07,
"loss": 2.108,
"step": 411
},
{
"epoch": 0.8033146478186692,
"grad_norm": 0.4189471900463104,
"learning_rate": 3.85195339273475e-07,
"loss": 2.0894,
"step": 412
},
{
"epoch": 0.8052644406531806,
"grad_norm": 0.4111238121986389,
"learning_rate": 3.8485263879369427e-07,
"loss": 2.051,
"step": 413
},
{
"epoch": 0.807214233487692,
"grad_norm": 0.4296090006828308,
"learning_rate": 3.845099383139136e-07,
"loss": 2.0484,
"step": 414
},
{
"epoch": 0.8091640263222033,
"grad_norm": 0.4000217020511627,
"learning_rate": 3.8416723783413297e-07,
"loss": 2.0449,
"step": 415
},
{
"epoch": 0.8111138191567145,
"grad_norm": 0.44013938307762146,
"learning_rate": 3.8382453735435226e-07,
"loss": 2.1467,
"step": 416
},
{
"epoch": 0.8130636119912259,
"grad_norm": 0.4252108633518219,
"learning_rate": 3.834818368745716e-07,
"loss": 2.0725,
"step": 417
},
{
"epoch": 0.8150134048257373,
"grad_norm": 0.41153863072395325,
"learning_rate": 3.8313913639479096e-07,
"loss": 2.0829,
"step": 418
},
{
"epoch": 0.8169631976602486,
"grad_norm": 0.417043536901474,
"learning_rate": 3.8279643591501025e-07,
"loss": 1.9899,
"step": 419
},
{
"epoch": 0.81891299049476,
"grad_norm": 0.41520485281944275,
"learning_rate": 3.824537354352296e-07,
"loss": 1.9941,
"step": 420
},
{
"epoch": 0.8208627833292713,
"grad_norm": 0.4316999912261963,
"learning_rate": 3.8211103495544895e-07,
"loss": 2.051,
"step": 421
},
{
"epoch": 0.8228125761637826,
"grad_norm": 0.4300172030925751,
"learning_rate": 3.8176833447566824e-07,
"loss": 2.025,
"step": 422
},
{
"epoch": 0.8247623689982939,
"grad_norm": 0.4366534650325775,
"learning_rate": 3.814256339958876e-07,
"loss": 2.1326,
"step": 423
},
{
"epoch": 0.8267121618328053,
"grad_norm": 0.412256121635437,
"learning_rate": 3.8108293351610694e-07,
"loss": 1.9799,
"step": 424
},
{
"epoch": 0.8286619546673166,
"grad_norm": 0.4404711425304413,
"learning_rate": 3.8074023303632623e-07,
"loss": 2.0618,
"step": 425
},
{
"epoch": 0.830611747501828,
"grad_norm": 0.41743820905685425,
"learning_rate": 3.803975325565456e-07,
"loss": 2.0293,
"step": 426
},
{
"epoch": 0.8325615403363392,
"grad_norm": 0.40452542901039124,
"learning_rate": 3.8005483207676487e-07,
"loss": 2.0561,
"step": 427
},
{
"epoch": 0.8345113331708506,
"grad_norm": 0.41732680797576904,
"learning_rate": 3.797121315969842e-07,
"loss": 1.9826,
"step": 428
},
{
"epoch": 0.8364611260053619,
"grad_norm": 0.43309998512268066,
"learning_rate": 3.7936943111720357e-07,
"loss": 2.0313,
"step": 429
},
{
"epoch": 0.8384109188398733,
"grad_norm": 0.43594348430633545,
"learning_rate": 3.7902673063742286e-07,
"loss": 2.0437,
"step": 430
},
{
"epoch": 0.8403607116743846,
"grad_norm": 0.43290477991104126,
"learning_rate": 3.786840301576422e-07,
"loss": 2.1213,
"step": 431
},
{
"epoch": 0.8423105045088959,
"grad_norm": 0.4143589735031128,
"learning_rate": 3.7834132967786156e-07,
"loss": 2.0327,
"step": 432
},
{
"epoch": 0.8442602973434072,
"grad_norm": 0.4311947226524353,
"learning_rate": 3.7799862919808085e-07,
"loss": 2.0604,
"step": 433
},
{
"epoch": 0.8462100901779186,
"grad_norm": 0.4119859039783478,
"learning_rate": 3.776559287183002e-07,
"loss": 2.0091,
"step": 434
},
{
"epoch": 0.84815988301243,
"grad_norm": 0.4251650869846344,
"learning_rate": 3.7731322823851955e-07,
"loss": 2.05,
"step": 435
},
{
"epoch": 0.8501096758469413,
"grad_norm": 0.4295788109302521,
"learning_rate": 3.7697052775873884e-07,
"loss": 2.0231,
"step": 436
},
{
"epoch": 0.8520594686814525,
"grad_norm": 0.4099411964416504,
"learning_rate": 3.766278272789582e-07,
"loss": 2.1037,
"step": 437
},
{
"epoch": 0.8540092615159639,
"grad_norm": 0.41294169425964355,
"learning_rate": 3.7628512679917754e-07,
"loss": 2.0535,
"step": 438
},
{
"epoch": 0.8559590543504753,
"grad_norm": 0.4004737138748169,
"learning_rate": 3.7594242631939683e-07,
"loss": 2.0395,
"step": 439
},
{
"epoch": 0.8579088471849866,
"grad_norm": 0.40913403034210205,
"learning_rate": 3.755997258396161e-07,
"loss": 1.9947,
"step": 440
},
{
"epoch": 0.859858640019498,
"grad_norm": 0.41119128465652466,
"learning_rate": 3.752570253598355e-07,
"loss": 1.9859,
"step": 441
},
{
"epoch": 0.8618084328540092,
"grad_norm": 0.44417282938957214,
"learning_rate": 3.749143248800548e-07,
"loss": 2.0712,
"step": 442
},
{
"epoch": 0.8637582256885206,
"grad_norm": 0.41587620973587036,
"learning_rate": 3.745716244002741e-07,
"loss": 1.9921,
"step": 443
},
{
"epoch": 0.8657080185230319,
"grad_norm": 0.4235389530658722,
"learning_rate": 3.742289239204935e-07,
"loss": 1.9941,
"step": 444
},
{
"epoch": 0.8676578113575433,
"grad_norm": 0.4219055771827698,
"learning_rate": 3.738862234407128e-07,
"loss": 2.0621,
"step": 445
},
{
"epoch": 0.8696076041920546,
"grad_norm": 0.42184367775917053,
"learning_rate": 3.735435229609321e-07,
"loss": 2.0307,
"step": 446
},
{
"epoch": 0.8715573970265659,
"grad_norm": 0.39649975299835205,
"learning_rate": 3.732008224811515e-07,
"loss": 2.0264,
"step": 447
},
{
"epoch": 0.8735071898610772,
"grad_norm": 0.4187317490577698,
"learning_rate": 3.728581220013708e-07,
"loss": 1.9778,
"step": 448
},
{
"epoch": 0.8754569826955886,
"grad_norm": 0.41368138790130615,
"learning_rate": 3.725154215215901e-07,
"loss": 1.9953,
"step": 449
},
{
"epoch": 0.8774067755300999,
"grad_norm": 0.4397999942302704,
"learning_rate": 3.721727210418095e-07,
"loss": 2.0835,
"step": 450
},
{
"epoch": 0.8793565683646113,
"grad_norm": 0.41927337646484375,
"learning_rate": 3.718300205620288e-07,
"loss": 2.0307,
"step": 451
},
{
"epoch": 0.8813063611991226,
"grad_norm": 0.43216344714164734,
"learning_rate": 3.714873200822481e-07,
"loss": 2.0669,
"step": 452
},
{
"epoch": 0.8832561540336339,
"grad_norm": 0.4566250741481781,
"learning_rate": 3.711446196024674e-07,
"loss": 2.0423,
"step": 453
},
{
"epoch": 0.8852059468681452,
"grad_norm": 0.4399709701538086,
"learning_rate": 3.708019191226868e-07,
"loss": 2.0859,
"step": 454
},
{
"epoch": 0.8871557397026566,
"grad_norm": 0.44788333773612976,
"learning_rate": 3.7045921864290607e-07,
"loss": 2.0349,
"step": 455
},
{
"epoch": 0.889105532537168,
"grad_norm": 0.4182490110397339,
"learning_rate": 3.7011651816312537e-07,
"loss": 1.9921,
"step": 456
},
{
"epoch": 0.8910553253716793,
"grad_norm": 0.4325038194656372,
"learning_rate": 3.6977381768334477e-07,
"loss": 2.0419,
"step": 457
},
{
"epoch": 0.8930051182061906,
"grad_norm": 0.48611199855804443,
"learning_rate": 3.6943111720356406e-07,
"loss": 2.1572,
"step": 458
},
{
"epoch": 0.8949549110407019,
"grad_norm": 0.4303911030292511,
"learning_rate": 3.6908841672378336e-07,
"loss": 2.0137,
"step": 459
},
{
"epoch": 0.8969047038752133,
"grad_norm": 0.4397573173046112,
"learning_rate": 3.6874571624400276e-07,
"loss": 2.0199,
"step": 460
},
{
"epoch": 0.8988544967097246,
"grad_norm": 0.4570363163948059,
"learning_rate": 3.6840301576422205e-07,
"loss": 2.0648,
"step": 461
},
{
"epoch": 0.900804289544236,
"grad_norm": 0.43259698152542114,
"learning_rate": 3.6806031528444135e-07,
"loss": 2.0121,
"step": 462
},
{
"epoch": 0.9027540823787472,
"grad_norm": 0.44078147411346436,
"learning_rate": 3.6771761480466075e-07,
"loss": 2.0422,
"step": 463
},
{
"epoch": 0.9047038752132586,
"grad_norm": 0.4169975519180298,
"learning_rate": 3.6737491432488004e-07,
"loss": 2.0453,
"step": 464
},
{
"epoch": 0.9066536680477699,
"grad_norm": 0.44096165895462036,
"learning_rate": 3.6703221384509934e-07,
"loss": 2.0722,
"step": 465
},
{
"epoch": 0.9086034608822813,
"grad_norm": 0.4220427870750427,
"learning_rate": 3.666895133653187e-07,
"loss": 2.052,
"step": 466
},
{
"epoch": 0.9105532537167926,
"grad_norm": 0.41613534092903137,
"learning_rate": 3.6634681288553803e-07,
"loss": 2.0031,
"step": 467
},
{
"epoch": 0.9125030465513039,
"grad_norm": 0.4290630519390106,
"learning_rate": 3.660041124057573e-07,
"loss": 2.108,
"step": 468
},
{
"epoch": 0.9144528393858152,
"grad_norm": 0.41508668661117554,
"learning_rate": 3.6566141192597667e-07,
"loss": 2.0369,
"step": 469
},
{
"epoch": 0.9164026322203266,
"grad_norm": 0.4051671326160431,
"learning_rate": 3.65318711446196e-07,
"loss": 2.0593,
"step": 470
},
{
"epoch": 0.9183524250548379,
"grad_norm": 0.427229642868042,
"learning_rate": 3.649760109664153e-07,
"loss": 2.0303,
"step": 471
},
{
"epoch": 0.9203022178893493,
"grad_norm": 0.408236026763916,
"learning_rate": 3.6463331048663466e-07,
"loss": 2.0537,
"step": 472
},
{
"epoch": 0.9222520107238605,
"grad_norm": 0.4055333435535431,
"learning_rate": 3.64290610006854e-07,
"loss": 1.9684,
"step": 473
},
{
"epoch": 0.9242018035583719,
"grad_norm": 0.4198017418384552,
"learning_rate": 3.639479095270733e-07,
"loss": 2.0429,
"step": 474
},
{
"epoch": 0.9261515963928832,
"grad_norm": 0.4309008717536926,
"learning_rate": 3.6360520904729265e-07,
"loss": 2.0844,
"step": 475
},
{
"epoch": 0.9281013892273946,
"grad_norm": 0.4177336096763611,
"learning_rate": 3.63262508567512e-07,
"loss": 2.0082,
"step": 476
},
{
"epoch": 0.930051182061906,
"grad_norm": 0.42606329917907715,
"learning_rate": 3.629198080877313e-07,
"loss": 2.0371,
"step": 477
},
{
"epoch": 0.9320009748964172,
"grad_norm": 0.4223528504371643,
"learning_rate": 3.6257710760795064e-07,
"loss": 2.0128,
"step": 478
},
{
"epoch": 0.9339507677309286,
"grad_norm": 0.43999001383781433,
"learning_rate": 3.6223440712816994e-07,
"loss": 1.9984,
"step": 479
},
{
"epoch": 0.9359005605654399,
"grad_norm": 0.44352471828460693,
"learning_rate": 3.618917066483893e-07,
"loss": 2.0501,
"step": 480
},
{
"epoch": 0.9378503533999513,
"grad_norm": 0.4229583740234375,
"learning_rate": 3.6154900616860863e-07,
"loss": 2.0403,
"step": 481
},
{
"epoch": 0.9398001462344626,
"grad_norm": 0.4202549457550049,
"learning_rate": 3.612063056888279e-07,
"loss": 1.9893,
"step": 482
},
{
"epoch": 0.941749939068974,
"grad_norm": 0.4364420771598816,
"learning_rate": 3.6086360520904727e-07,
"loss": 1.9953,
"step": 483
},
{
"epoch": 0.9436997319034852,
"grad_norm": 0.4317263662815094,
"learning_rate": 3.605209047292666e-07,
"loss": 2.0787,
"step": 484
},
{
"epoch": 0.9456495247379966,
"grad_norm": 0.44858187437057495,
"learning_rate": 3.601782042494859e-07,
"loss": 2.1139,
"step": 485
},
{
"epoch": 0.9475993175725079,
"grad_norm": 0.4311455488204956,
"learning_rate": 3.5983550376970526e-07,
"loss": 2.0409,
"step": 486
},
{
"epoch": 0.9495491104070193,
"grad_norm": 0.42990413308143616,
"learning_rate": 3.594928032899246e-07,
"loss": 2.0478,
"step": 487
},
{
"epoch": 0.9514989032415306,
"grad_norm": 0.4484078288078308,
"learning_rate": 3.591501028101439e-07,
"loss": 1.9989,
"step": 488
},
{
"epoch": 0.9534486960760419,
"grad_norm": 0.438047856092453,
"learning_rate": 3.5880740233036325e-07,
"loss": 2.0468,
"step": 489
},
{
"epoch": 0.9553984889105532,
"grad_norm": 0.4557168483734131,
"learning_rate": 3.584647018505826e-07,
"loss": 2.1145,
"step": 490
},
{
"epoch": 0.9573482817450646,
"grad_norm": 0.41166436672210693,
"learning_rate": 3.581220013708019e-07,
"loss": 2.0639,
"step": 491
},
{
"epoch": 0.9592980745795759,
"grad_norm": 0.4612530767917633,
"learning_rate": 3.577793008910212e-07,
"loss": 2.0139,
"step": 492
},
{
"epoch": 0.9612478674140873,
"grad_norm": 0.4352019429206848,
"learning_rate": 3.574366004112406e-07,
"loss": 2.0984,
"step": 493
},
{
"epoch": 0.9631976602485985,
"grad_norm": 0.4246942400932312,
"learning_rate": 3.570938999314599e-07,
"loss": 2.054,
"step": 494
},
{
"epoch": 0.9651474530831099,
"grad_norm": 0.4309667646884918,
"learning_rate": 3.567511994516792e-07,
"loss": 1.9942,
"step": 495
},
{
"epoch": 0.9670972459176213,
"grad_norm": 0.4459112584590912,
"learning_rate": 3.564084989718986e-07,
"loss": 2.0221,
"step": 496
},
{
"epoch": 0.9690470387521326,
"grad_norm": 0.44149142503738403,
"learning_rate": 3.5606579849211787e-07,
"loss": 2.0181,
"step": 497
},
{
"epoch": 0.970996831586644,
"grad_norm": 0.4406503736972809,
"learning_rate": 3.5572309801233717e-07,
"loss": 2.0666,
"step": 498
},
{
"epoch": 0.9729466244211552,
"grad_norm": 0.4117674231529236,
"learning_rate": 3.5538039753255657e-07,
"loss": 1.982,
"step": 499
},
{
"epoch": 0.9748964172556666,
"grad_norm": 0.43600788712501526,
"learning_rate": 3.5503769705277586e-07,
"loss": 1.9772,
"step": 500
},
{
"epoch": 0.9768462100901779,
"grad_norm": 0.42391106486320496,
"learning_rate": 3.5469499657299516e-07,
"loss": 2.0304,
"step": 501
},
{
"epoch": 0.9787960029246893,
"grad_norm": 0.44462934136390686,
"learning_rate": 3.5435229609321456e-07,
"loss": 2.0374,
"step": 502
},
{
"epoch": 0.9807457957592006,
"grad_norm": 0.45238927006721497,
"learning_rate": 3.5400959561343385e-07,
"loss": 2.057,
"step": 503
},
{
"epoch": 0.9826955885937119,
"grad_norm": 0.43034645915031433,
"learning_rate": 3.5366689513365315e-07,
"loss": 2.0392,
"step": 504
},
{
"epoch": 0.9846453814282232,
"grad_norm": 0.42902877926826477,
"learning_rate": 3.533241946538725e-07,
"loss": 2.045,
"step": 505
},
{
"epoch": 0.9865951742627346,
"grad_norm": 0.4340520203113556,
"learning_rate": 3.5298149417409184e-07,
"loss": 2.0439,
"step": 506
},
{
"epoch": 0.9885449670972459,
"grad_norm": 0.45374131202697754,
"learning_rate": 3.5263879369431114e-07,
"loss": 2.0431,
"step": 507
},
{
"epoch": 0.9904947599317573,
"grad_norm": 0.44037064909935,
"learning_rate": 3.522960932145305e-07,
"loss": 2.0123,
"step": 508
},
{
"epoch": 0.9924445527662685,
"grad_norm": 0.42846593260765076,
"learning_rate": 3.5195339273474983e-07,
"loss": 1.9661,
"step": 509
},
{
"epoch": 0.9943943456007799,
"grad_norm": 0.4789009392261505,
"learning_rate": 3.516106922549691e-07,
"loss": 2.0753,
"step": 510
},
{
"epoch": 0.9963441384352912,
"grad_norm": 0.44283124804496765,
"learning_rate": 3.5126799177518847e-07,
"loss": 2.0581,
"step": 511
},
{
"epoch": 0.9982939312698026,
"grad_norm": 0.43828728795051575,
"learning_rate": 3.509252912954078e-07,
"loss": 2.05,
"step": 512
},
{
"epoch": 0.9982939312698026,
"eval_loss": 2.046032667160034,
"eval_runtime": 481.0273,
"eval_samples_per_second": 1.293,
"eval_steps_per_second": 0.324,
"step": 512
},
{
"epoch": 1.0002437241043138,
"grad_norm": 0.41433945298194885,
"learning_rate": 3.505825908156271e-07,
"loss": 2.0366,
"step": 513
},
{
"epoch": 1.0021935169388252,
"grad_norm": 0.42399510741233826,
"learning_rate": 3.5023989033584646e-07,
"loss": 1.991,
"step": 514
},
{
"epoch": 1.0041433097733365,
"grad_norm": 0.45652541518211365,
"learning_rate": 3.4989718985606576e-07,
"loss": 2.0066,
"step": 515
},
{
"epoch": 1.006093102607848,
"grad_norm": 0.43585795164108276,
"learning_rate": 3.495544893762851e-07,
"loss": 2.0025,
"step": 516
},
{
"epoch": 1.0080428954423593,
"grad_norm": 0.43803489208221436,
"learning_rate": 3.4921178889650445e-07,
"loss": 2.0654,
"step": 517
},
{
"epoch": 1.0099926882768706,
"grad_norm": 0.43803176283836365,
"learning_rate": 3.4886908841672375e-07,
"loss": 2.0896,
"step": 518
},
{
"epoch": 1.011942481111382,
"grad_norm": 0.41983944177627563,
"learning_rate": 3.485263879369431e-07,
"loss": 2.0335,
"step": 519
},
{
"epoch": 1.0138922739458933,
"grad_norm": 0.4354363977909088,
"learning_rate": 3.4818368745716244e-07,
"loss": 2.0699,
"step": 520
},
{
"epoch": 1.0158420667804047,
"grad_norm": 0.42140671610832214,
"learning_rate": 3.4784098697738174e-07,
"loss": 1.9646,
"step": 521
},
{
"epoch": 1.0177918596149158,
"grad_norm": 0.4265493154525757,
"learning_rate": 3.474982864976011e-07,
"loss": 2.0735,
"step": 522
},
{
"epoch": 1.0197416524494272,
"grad_norm": 0.43847259879112244,
"learning_rate": 3.4715558601782043e-07,
"loss": 2.0986,
"step": 523
},
{
"epoch": 1.0216914452839385,
"grad_norm": 0.4600801467895508,
"learning_rate": 3.468128855380397e-07,
"loss": 2.0643,
"step": 524
},
{
"epoch": 1.0236412381184499,
"grad_norm": 0.42904648184776306,
"learning_rate": 3.4647018505825907e-07,
"loss": 2.0056,
"step": 525
},
{
"epoch": 1.0255910309529612,
"grad_norm": 0.46431151032447815,
"learning_rate": 3.461274845784784e-07,
"loss": 2.1056,
"step": 526
},
{
"epoch": 1.0275408237874726,
"grad_norm": 0.455836683511734,
"learning_rate": 3.457847840986977e-07,
"loss": 2.0187,
"step": 527
},
{
"epoch": 1.029490616621984,
"grad_norm": 0.4192461669445038,
"learning_rate": 3.45442083618917e-07,
"loss": 2.0832,
"step": 528
},
{
"epoch": 1.0314404094564953,
"grad_norm": 0.4513595402240753,
"learning_rate": 3.450993831391364e-07,
"loss": 2.058,
"step": 529
},
{
"epoch": 1.0333902022910066,
"grad_norm": 0.4370152950286865,
"learning_rate": 3.447566826593557e-07,
"loss": 2.0537,
"step": 530
},
{
"epoch": 1.035339995125518,
"grad_norm": 0.4199161231517792,
"learning_rate": 3.44413982179575e-07,
"loss": 1.9518,
"step": 531
},
{
"epoch": 1.0372897879600294,
"grad_norm": 0.43688762187957764,
"learning_rate": 3.440712816997944e-07,
"loss": 2.0444,
"step": 532
},
{
"epoch": 1.0392395807945405,
"grad_norm": 0.49809253215789795,
"learning_rate": 3.437285812200137e-07,
"loss": 2.0401,
"step": 533
},
{
"epoch": 1.0411893736290518,
"grad_norm": 0.4518781900405884,
"learning_rate": 3.43385880740233e-07,
"loss": 2.0605,
"step": 534
},
{
"epoch": 1.0431391664635632,
"grad_norm": 0.45353132486343384,
"learning_rate": 3.430431802604524e-07,
"loss": 2.0402,
"step": 535
},
{
"epoch": 1.0450889592980745,
"grad_norm": 0.4396359622478485,
"learning_rate": 3.427004797806717e-07,
"loss": 2.0643,
"step": 536
},
{
"epoch": 1.047038752132586,
"grad_norm": 0.4434252083301544,
"learning_rate": 3.42357779300891e-07,
"loss": 2.0188,
"step": 537
},
{
"epoch": 1.0489885449670973,
"grad_norm": 0.4241044819355011,
"learning_rate": 3.420150788211104e-07,
"loss": 1.9556,
"step": 538
},
{
"epoch": 1.0509383378016086,
"grad_norm": 0.4382232129573822,
"learning_rate": 3.4167237834132967e-07,
"loss": 1.9855,
"step": 539
},
{
"epoch": 1.05288813063612,
"grad_norm": 0.4357564151287079,
"learning_rate": 3.4132967786154897e-07,
"loss": 2.0524,
"step": 540
},
{
"epoch": 1.0548379234706313,
"grad_norm": 0.46050140261650085,
"learning_rate": 3.409869773817683e-07,
"loss": 2.0461,
"step": 541
},
{
"epoch": 1.0567877163051427,
"grad_norm": 0.44581982493400574,
"learning_rate": 3.4064427690198766e-07,
"loss": 1.9955,
"step": 542
},
{
"epoch": 1.0587375091396538,
"grad_norm": 0.4502599835395813,
"learning_rate": 3.4030157642220696e-07,
"loss": 2.0864,
"step": 543
},
{
"epoch": 1.0606873019741652,
"grad_norm": 0.44767019152641296,
"learning_rate": 3.399588759424263e-07,
"loss": 2.0447,
"step": 544
},
{
"epoch": 1.0626370948086765,
"grad_norm": 0.44603490829467773,
"learning_rate": 3.3961617546264565e-07,
"loss": 2.0709,
"step": 545
},
{
"epoch": 1.0645868876431879,
"grad_norm": 0.4321264922618866,
"learning_rate": 3.3927347498286495e-07,
"loss": 2.0157,
"step": 546
},
{
"epoch": 1.0665366804776992,
"grad_norm": 0.4479556083679199,
"learning_rate": 3.389307745030843e-07,
"loss": 2.1088,
"step": 547
},
{
"epoch": 1.0684864733122106,
"grad_norm": 0.4292636513710022,
"learning_rate": 3.3858807402330364e-07,
"loss": 2.0847,
"step": 548
},
{
"epoch": 1.070436266146722,
"grad_norm": 0.43631821870803833,
"learning_rate": 3.3824537354352294e-07,
"loss": 2.034,
"step": 549
},
{
"epoch": 1.0723860589812333,
"grad_norm": 0.43201327323913574,
"learning_rate": 3.379026730637423e-07,
"loss": 1.9633,
"step": 550
},
{
"epoch": 1.0743358518157446,
"grad_norm": 0.4389747679233551,
"learning_rate": 3.3755997258396163e-07,
"loss": 2.0331,
"step": 551
},
{
"epoch": 1.076285644650256,
"grad_norm": 0.46588924527168274,
"learning_rate": 3.372172721041809e-07,
"loss": 2.0748,
"step": 552
},
{
"epoch": 1.0782354374847674,
"grad_norm": 0.45190852880477905,
"learning_rate": 3.3687457162440027e-07,
"loss": 1.9639,
"step": 553
},
{
"epoch": 1.0801852303192785,
"grad_norm": 0.4458979070186615,
"learning_rate": 3.3653187114461957e-07,
"loss": 2.1124,
"step": 554
},
{
"epoch": 1.0821350231537898,
"grad_norm": 0.40400832891464233,
"learning_rate": 3.361891706648389e-07,
"loss": 1.9776,
"step": 555
},
{
"epoch": 1.0840848159883012,
"grad_norm": 0.4538462460041046,
"learning_rate": 3.3584647018505826e-07,
"loss": 1.9962,
"step": 556
},
{
"epoch": 1.0860346088228126,
"grad_norm": 0.44181132316589355,
"learning_rate": 3.3550376970527756e-07,
"loss": 2.0973,
"step": 557
},
{
"epoch": 1.087984401657324,
"grad_norm": 0.43516308069229126,
"learning_rate": 3.351610692254969e-07,
"loss": 1.9923,
"step": 558
},
{
"epoch": 1.0899341944918353,
"grad_norm": 0.4485546052455902,
"learning_rate": 3.3481836874571625e-07,
"loss": 2.0242,
"step": 559
},
{
"epoch": 1.0918839873263466,
"grad_norm": 0.45358070731163025,
"learning_rate": 3.3447566826593555e-07,
"loss": 2.0603,
"step": 560
},
{
"epoch": 1.093833780160858,
"grad_norm": 0.43879690766334534,
"learning_rate": 3.341329677861549e-07,
"loss": 1.9869,
"step": 561
},
{
"epoch": 1.0957835729953693,
"grad_norm": 0.4376320242881775,
"learning_rate": 3.3379026730637424e-07,
"loss": 2.0447,
"step": 562
},
{
"epoch": 1.0977333658298805,
"grad_norm": 0.4591986835002899,
"learning_rate": 3.3344756682659354e-07,
"loss": 2.0188,
"step": 563
},
{
"epoch": 1.0996831586643918,
"grad_norm": 0.4306589961051941,
"learning_rate": 3.331048663468129e-07,
"loss": 2.0223,
"step": 564
},
{
"epoch": 1.1016329514989032,
"grad_norm": 0.43692710995674133,
"learning_rate": 3.3276216586703223e-07,
"loss": 2.0507,
"step": 565
},
{
"epoch": 1.1035827443334145,
"grad_norm": 0.4663935601711273,
"learning_rate": 3.324194653872515e-07,
"loss": 2.0444,
"step": 566
},
{
"epoch": 1.1055325371679259,
"grad_norm": 0.45090562105178833,
"learning_rate": 3.320767649074708e-07,
"loss": 1.9944,
"step": 567
},
{
"epoch": 1.1074823300024372,
"grad_norm": 0.4450632631778717,
"learning_rate": 3.317340644276902e-07,
"loss": 2.0264,
"step": 568
},
{
"epoch": 1.1094321228369486,
"grad_norm": 0.45126745104789734,
"learning_rate": 3.313913639479095e-07,
"loss": 2.081,
"step": 569
},
{
"epoch": 1.11138191567146,
"grad_norm": 0.44254472851753235,
"learning_rate": 3.310486634681288e-07,
"loss": 2.0223,
"step": 570
},
{
"epoch": 1.1133317085059713,
"grad_norm": 0.43211621046066284,
"learning_rate": 3.307059629883482e-07,
"loss": 2.0363,
"step": 571
},
{
"epoch": 1.1152815013404827,
"grad_norm": 0.4256265163421631,
"learning_rate": 3.303632625085675e-07,
"loss": 2.0363,
"step": 572
},
{
"epoch": 1.117231294174994,
"grad_norm": 0.4462417960166931,
"learning_rate": 3.300205620287868e-07,
"loss": 2.0394,
"step": 573
},
{
"epoch": 1.1191810870095051,
"grad_norm": 0.4583437442779541,
"learning_rate": 3.296778615490062e-07,
"loss": 2.0878,
"step": 574
},
{
"epoch": 1.1211308798440165,
"grad_norm": 0.4595088064670563,
"learning_rate": 3.293351610692255e-07,
"loss": 2.111,
"step": 575
},
{
"epoch": 1.1230806726785278,
"grad_norm": 0.4117080569267273,
"learning_rate": 3.289924605894448e-07,
"loss": 1.999,
"step": 576
},
{
"epoch": 1.1250304655130392,
"grad_norm": 0.4381641149520874,
"learning_rate": 3.286497601096642e-07,
"loss": 2.044,
"step": 577
},
{
"epoch": 1.1269802583475506,
"grad_norm": 0.43854039907455444,
"learning_rate": 3.283070596298835e-07,
"loss": 2.0272,
"step": 578
},
{
"epoch": 1.128930051182062,
"grad_norm": 0.4721965789794922,
"learning_rate": 3.279643591501028e-07,
"loss": 2.0697,
"step": 579
},
{
"epoch": 1.1308798440165733,
"grad_norm": 0.4373783767223358,
"learning_rate": 3.2762165867032207e-07,
"loss": 2.0102,
"step": 580
},
{
"epoch": 1.1328296368510846,
"grad_norm": 0.4286502003669739,
"learning_rate": 3.2727895819054147e-07,
"loss": 1.9695,
"step": 581
},
{
"epoch": 1.134779429685596,
"grad_norm": 0.4373305141925812,
"learning_rate": 3.2693625771076077e-07,
"loss": 1.9823,
"step": 582
},
{
"epoch": 1.1367292225201073,
"grad_norm": 0.4659106433391571,
"learning_rate": 3.2659355723098006e-07,
"loss": 2.081,
"step": 583
},
{
"epoch": 1.1386790153546187,
"grad_norm": 0.4315546154975891,
"learning_rate": 3.2625085675119946e-07,
"loss": 2.0336,
"step": 584
},
{
"epoch": 1.1406288081891298,
"grad_norm": 0.4512901306152344,
"learning_rate": 3.2590815627141876e-07,
"loss": 2.0642,
"step": 585
},
{
"epoch": 1.1425786010236412,
"grad_norm": 0.4398232400417328,
"learning_rate": 3.2556545579163805e-07,
"loss": 2.0401,
"step": 586
},
{
"epoch": 1.1445283938581525,
"grad_norm": 0.45262405276298523,
"learning_rate": 3.2522275531185745e-07,
"loss": 2.0999,
"step": 587
},
{
"epoch": 1.1464781866926639,
"grad_norm": 0.4210640490055084,
"learning_rate": 3.2488005483207675e-07,
"loss": 1.992,
"step": 588
},
{
"epoch": 1.1484279795271752,
"grad_norm": 0.4530121386051178,
"learning_rate": 3.2453735435229604e-07,
"loss": 2.0119,
"step": 589
},
{
"epoch": 1.1503777723616866,
"grad_norm": 0.43637722730636597,
"learning_rate": 3.2419465387251544e-07,
"loss": 2.0022,
"step": 590
},
{
"epoch": 1.152327565196198,
"grad_norm": 0.46872228384017944,
"learning_rate": 3.2385195339273474e-07,
"loss": 2.0545,
"step": 591
},
{
"epoch": 1.1542773580307093,
"grad_norm": 0.45964333415031433,
"learning_rate": 3.2350925291295403e-07,
"loss": 2.0313,
"step": 592
},
{
"epoch": 1.1562271508652207,
"grad_norm": 0.4444529414176941,
"learning_rate": 3.231665524331734e-07,
"loss": 2.0463,
"step": 593
},
{
"epoch": 1.1581769436997318,
"grad_norm": 0.4702310264110565,
"learning_rate": 3.228238519533927e-07,
"loss": 2.0055,
"step": 594
},
{
"epoch": 1.1601267365342431,
"grad_norm": 0.4435891807079315,
"learning_rate": 3.22481151473612e-07,
"loss": 2.1027,
"step": 595
},
{
"epoch": 1.1620765293687545,
"grad_norm": 0.4557732343673706,
"learning_rate": 3.2213845099383137e-07,
"loss": 2.0307,
"step": 596
},
{
"epoch": 1.1640263222032659,
"grad_norm": 0.4286348819732666,
"learning_rate": 3.217957505140507e-07,
"loss": 2.0196,
"step": 597
},
{
"epoch": 1.1659761150377772,
"grad_norm": 0.4475346803665161,
"learning_rate": 3.2145305003427e-07,
"loss": 2.1014,
"step": 598
},
{
"epoch": 1.1679259078722886,
"grad_norm": 0.418293297290802,
"learning_rate": 3.2111034955448936e-07,
"loss": 2.078,
"step": 599
},
{
"epoch": 1.1698757007068,
"grad_norm": 0.42740973830223083,
"learning_rate": 3.207676490747087e-07,
"loss": 1.9695,
"step": 600
},
{
"epoch": 1.1718254935413113,
"grad_norm": 0.41325512528419495,
"learning_rate": 3.20424948594928e-07,
"loss": 2.0297,
"step": 601
},
{
"epoch": 1.1737752863758226,
"grad_norm": 0.4326270818710327,
"learning_rate": 3.2008224811514735e-07,
"loss": 2.0059,
"step": 602
},
{
"epoch": 1.175725079210334,
"grad_norm": 0.44774889945983887,
"learning_rate": 3.197395476353667e-07,
"loss": 2.0427,
"step": 603
},
{
"epoch": 1.1776748720448453,
"grad_norm": 0.446158766746521,
"learning_rate": 3.19396847155586e-07,
"loss": 2.0748,
"step": 604
},
{
"epoch": 1.1796246648793565,
"grad_norm": 0.4654727280139923,
"learning_rate": 3.1905414667580534e-07,
"loss": 1.9297,
"step": 605
},
{
"epoch": 1.1815744577138678,
"grad_norm": 0.45213672518730164,
"learning_rate": 3.1871144619602463e-07,
"loss": 2.1087,
"step": 606
},
{
"epoch": 1.1835242505483792,
"grad_norm": 0.45228397846221924,
"learning_rate": 3.18368745716244e-07,
"loss": 2.0961,
"step": 607
},
{
"epoch": 1.1854740433828905,
"grad_norm": 0.4470541477203369,
"learning_rate": 3.180260452364633e-07,
"loss": 2.0073,
"step": 608
},
{
"epoch": 1.1874238362174019,
"grad_norm": 0.4324132204055786,
"learning_rate": 3.176833447566826e-07,
"loss": 2.0334,
"step": 609
},
{
"epoch": 1.1893736290519132,
"grad_norm": 0.47044241428375244,
"learning_rate": 3.1734064427690197e-07,
"loss": 2.1086,
"step": 610
},
{
"epoch": 1.1913234218864246,
"grad_norm": 0.43018707633018494,
"learning_rate": 3.169979437971213e-07,
"loss": 2.0289,
"step": 611
},
{
"epoch": 1.193273214720936,
"grad_norm": 0.44133853912353516,
"learning_rate": 3.166552433173406e-07,
"loss": 2.0333,
"step": 612
},
{
"epoch": 1.1952230075554473,
"grad_norm": 0.45557719469070435,
"learning_rate": 3.1631254283755996e-07,
"loss": 2.0399,
"step": 613
},
{
"epoch": 1.1971728003899587,
"grad_norm": 0.4350452125072479,
"learning_rate": 3.159698423577793e-07,
"loss": 2.0224,
"step": 614
},
{
"epoch": 1.19912259322447,
"grad_norm": 0.4687999188899994,
"learning_rate": 3.156271418779986e-07,
"loss": 2.0228,
"step": 615
},
{
"epoch": 1.2010723860589811,
"grad_norm": 0.43684178590774536,
"learning_rate": 3.1528444139821795e-07,
"loss": 2.0776,
"step": 616
},
{
"epoch": 1.2030221788934925,
"grad_norm": 0.45561161637306213,
"learning_rate": 3.149417409184373e-07,
"loss": 2.0022,
"step": 617
},
{
"epoch": 1.2049719717280039,
"grad_norm": 0.4689810276031494,
"learning_rate": 3.145990404386566e-07,
"loss": 2.0173,
"step": 618
},
{
"epoch": 1.2069217645625152,
"grad_norm": 0.4293496310710907,
"learning_rate": 3.142563399588759e-07,
"loss": 1.9824,
"step": 619
},
{
"epoch": 1.2088715573970266,
"grad_norm": 0.4662802219390869,
"learning_rate": 3.139136394790953e-07,
"loss": 2.0784,
"step": 620
},
{
"epoch": 1.210821350231538,
"grad_norm": 0.45310187339782715,
"learning_rate": 3.135709389993146e-07,
"loss": 1.9844,
"step": 621
},
{
"epoch": 1.2127711430660493,
"grad_norm": 0.4419795870780945,
"learning_rate": 3.1322823851953387e-07,
"loss": 2.0515,
"step": 622
},
{
"epoch": 1.2147209359005606,
"grad_norm": 0.4516865611076355,
"learning_rate": 3.1288553803975327e-07,
"loss": 2.0879,
"step": 623
},
{
"epoch": 1.216670728735072,
"grad_norm": 0.46178489923477173,
"learning_rate": 3.1254283755997257e-07,
"loss": 2.0498,
"step": 624
},
{
"epoch": 1.2186205215695831,
"grad_norm": 0.4678952097892761,
"learning_rate": 3.1220013708019186e-07,
"loss": 2.0408,
"step": 625
},
{
"epoch": 1.2205703144040945,
"grad_norm": 0.4456236660480499,
"learning_rate": 3.1185743660041126e-07,
"loss": 1.9694,
"step": 626
},
{
"epoch": 1.2225201072386058,
"grad_norm": 0.4397581219673157,
"learning_rate": 3.1151473612063056e-07,
"loss": 2.0048,
"step": 627
},
{
"epoch": 1.2244699000731172,
"grad_norm": 0.4338027238845825,
"learning_rate": 3.1117203564084985e-07,
"loss": 2.0194,
"step": 628
},
{
"epoch": 1.2264196929076285,
"grad_norm": 0.4413823187351227,
"learning_rate": 3.108293351610692e-07,
"loss": 2.025,
"step": 629
},
{
"epoch": 1.2283694857421399,
"grad_norm": 0.43685299158096313,
"learning_rate": 3.1048663468128855e-07,
"loss": 2.0051,
"step": 630
},
{
"epoch": 1.2303192785766512,
"grad_norm": 0.4644426107406616,
"learning_rate": 3.1014393420150784e-07,
"loss": 2.0313,
"step": 631
},
{
"epoch": 1.2322690714111626,
"grad_norm": 0.4478755593299866,
"learning_rate": 3.098012337217272e-07,
"loss": 1.9669,
"step": 632
},
{
"epoch": 1.234218864245674,
"grad_norm": 0.43452218174934387,
"learning_rate": 3.0945853324194654e-07,
"loss": 1.9927,
"step": 633
},
{
"epoch": 1.2361686570801853,
"grad_norm": 0.4408141076564789,
"learning_rate": 3.0911583276216583e-07,
"loss": 2.136,
"step": 634
},
{
"epoch": 1.2381184499146967,
"grad_norm": 0.42754924297332764,
"learning_rate": 3.087731322823852e-07,
"loss": 2.0247,
"step": 635
},
{
"epoch": 1.2400682427492078,
"grad_norm": 0.4387798607349396,
"learning_rate": 3.084304318026045e-07,
"loss": 1.9643,
"step": 636
},
{
"epoch": 1.2420180355837191,
"grad_norm": 0.46978920698165894,
"learning_rate": 3.080877313228238e-07,
"loss": 2.0776,
"step": 637
},
{
"epoch": 1.2439678284182305,
"grad_norm": 0.41821563243865967,
"learning_rate": 3.0774503084304317e-07,
"loss": 2.0355,
"step": 638
},
{
"epoch": 1.2459176212527419,
"grad_norm": 0.4664837419986725,
"learning_rate": 3.074023303632625e-07,
"loss": 2.0328,
"step": 639
},
{
"epoch": 1.2478674140872532,
"grad_norm": 0.4467378258705139,
"learning_rate": 3.070596298834818e-07,
"loss": 2.0058,
"step": 640
},
{
"epoch": 1.2498172069217646,
"grad_norm": 0.442058265209198,
"learning_rate": 3.0671692940370116e-07,
"loss": 2.0565,
"step": 641
},
{
"epoch": 1.251766999756276,
"grad_norm": 0.4655166268348694,
"learning_rate": 3.0637422892392045e-07,
"loss": 2.0628,
"step": 642
},
{
"epoch": 1.2537167925907873,
"grad_norm": 0.4388466477394104,
"learning_rate": 3.060315284441398e-07,
"loss": 2.0716,
"step": 643
},
{
"epoch": 1.2556665854252986,
"grad_norm": 0.48705416917800903,
"learning_rate": 3.0568882796435915e-07,
"loss": 1.9872,
"step": 644
},
{
"epoch": 1.2576163782598098,
"grad_norm": 0.4618842899799347,
"learning_rate": 3.0534612748457844e-07,
"loss": 2.0306,
"step": 645
},
{
"epoch": 1.2595661710943213,
"grad_norm": 0.46533843874931335,
"learning_rate": 3.050034270047978e-07,
"loss": 2.0827,
"step": 646
},
{
"epoch": 1.2615159639288325,
"grad_norm": 0.4898700714111328,
"learning_rate": 3.0466072652501714e-07,
"loss": 1.9585,
"step": 647
},
{
"epoch": 1.2634657567633438,
"grad_norm": 0.4561532735824585,
"learning_rate": 3.0431802604523643e-07,
"loss": 2.0689,
"step": 648
},
{
"epoch": 1.2654155495978552,
"grad_norm": 0.4628736078739166,
"learning_rate": 3.039753255654558e-07,
"loss": 2.0307,
"step": 649
},
{
"epoch": 1.2673653424323665,
"grad_norm": 0.4475798010826111,
"learning_rate": 3.036326250856751e-07,
"loss": 2.0372,
"step": 650
},
{
"epoch": 1.269315135266878,
"grad_norm": 0.44448035955429077,
"learning_rate": 3.032899246058944e-07,
"loss": 2.0334,
"step": 651
},
{
"epoch": 1.2712649281013892,
"grad_norm": 0.4554859697818756,
"learning_rate": 3.0294722412611377e-07,
"loss": 2.0487,
"step": 652
},
{
"epoch": 1.2732147209359006,
"grad_norm": 0.44150403141975403,
"learning_rate": 3.026045236463331e-07,
"loss": 2.085,
"step": 653
},
{
"epoch": 1.275164513770412,
"grad_norm": 0.4476960301399231,
"learning_rate": 3.022618231665524e-07,
"loss": 1.9762,
"step": 654
},
{
"epoch": 1.2771143066049233,
"grad_norm": 0.4773290753364563,
"learning_rate": 3.019191226867717e-07,
"loss": 2.0565,
"step": 655
},
{
"epoch": 1.2790640994394344,
"grad_norm": 0.43788987398147583,
"learning_rate": 3.015764222069911e-07,
"loss": 2.0629,
"step": 656
},
{
"epoch": 1.281013892273946,
"grad_norm": 0.4314157962799072,
"learning_rate": 3.012337217272104e-07,
"loss": 2.0554,
"step": 657
},
{
"epoch": 1.2829636851084572,
"grad_norm": 0.45381680130958557,
"learning_rate": 3.008910212474297e-07,
"loss": 2.0514,
"step": 658
},
{
"epoch": 1.2849134779429685,
"grad_norm": 0.47213441133499146,
"learning_rate": 3.005483207676491e-07,
"loss": 2.0267,
"step": 659
},
{
"epoch": 1.2868632707774799,
"grad_norm": 0.4460486173629761,
"learning_rate": 3.002056202878684e-07,
"loss": 2.0717,
"step": 660
},
{
"epoch": 1.2888130636119912,
"grad_norm": 0.452747642993927,
"learning_rate": 2.998629198080877e-07,
"loss": 2.0634,
"step": 661
},
{
"epoch": 1.2907628564465026,
"grad_norm": 0.4495120942592621,
"learning_rate": 2.995202193283071e-07,
"loss": 2.042,
"step": 662
},
{
"epoch": 1.292712649281014,
"grad_norm": 0.433224081993103,
"learning_rate": 2.991775188485264e-07,
"loss": 2.0565,
"step": 663
},
{
"epoch": 1.2946624421155253,
"grad_norm": 0.4596520960330963,
"learning_rate": 2.9883481836874567e-07,
"loss": 2.0272,
"step": 664
},
{
"epoch": 1.2966122349500366,
"grad_norm": 0.433887243270874,
"learning_rate": 2.9849211788896507e-07,
"loss": 1.965,
"step": 665
},
{
"epoch": 1.298562027784548,
"grad_norm": 0.44755810499191284,
"learning_rate": 2.9814941740918437e-07,
"loss": 1.9915,
"step": 666
},
{
"epoch": 1.3005118206190591,
"grad_norm": 0.48203861713409424,
"learning_rate": 2.9780671692940366e-07,
"loss": 2.0296,
"step": 667
},
{
"epoch": 1.3024616134535705,
"grad_norm": 0.4314959943294525,
"learning_rate": 2.97464016449623e-07,
"loss": 2.0282,
"step": 668
},
{
"epoch": 1.3044114062880818,
"grad_norm": 0.4476211369037628,
"learning_rate": 2.9712131596984236e-07,
"loss": 2.0348,
"step": 669
},
{
"epoch": 1.3063611991225932,
"grad_norm": 0.45356854796409607,
"learning_rate": 2.9677861549006165e-07,
"loss": 2.0369,
"step": 670
},
{
"epoch": 1.3083109919571045,
"grad_norm": 0.4637032747268677,
"learning_rate": 2.96435915010281e-07,
"loss": 2.1002,
"step": 671
},
{
"epoch": 1.310260784791616,
"grad_norm": 0.4258365333080292,
"learning_rate": 2.9609321453050035e-07,
"loss": 2.0184,
"step": 672
},
{
"epoch": 1.3122105776261273,
"grad_norm": 0.4571716785430908,
"learning_rate": 2.9575051405071964e-07,
"loss": 2.0711,
"step": 673
},
{
"epoch": 1.3141603704606386,
"grad_norm": 0.4479144215583801,
"learning_rate": 2.95407813570939e-07,
"loss": 2.1037,
"step": 674
},
{
"epoch": 1.31611016329515,
"grad_norm": 0.463773638010025,
"learning_rate": 2.9506511309115834e-07,
"loss": 2.087,
"step": 675
},
{
"epoch": 1.318059956129661,
"grad_norm": 0.4595959782600403,
"learning_rate": 2.9472241261137763e-07,
"loss": 2.0246,
"step": 676
},
{
"epoch": 1.3200097489641727,
"grad_norm": 0.41977226734161377,
"learning_rate": 2.94379712131597e-07,
"loss": 2.0132,
"step": 677
},
{
"epoch": 1.3219595417986838,
"grad_norm": 0.4429217576980591,
"learning_rate": 2.940370116518163e-07,
"loss": 2.0414,
"step": 678
},
{
"epoch": 1.3239093346331952,
"grad_norm": 0.46036285161972046,
"learning_rate": 2.936943111720356e-07,
"loss": 2.0474,
"step": 679
},
{
"epoch": 1.3258591274677065,
"grad_norm": 0.4518478512763977,
"learning_rate": 2.9335161069225497e-07,
"loss": 1.991,
"step": 680
},
{
"epoch": 1.3278089203022179,
"grad_norm": 0.4507528841495514,
"learning_rate": 2.9300891021247426e-07,
"loss": 2.0038,
"step": 681
},
{
"epoch": 1.3297587131367292,
"grad_norm": 0.45446595549583435,
"learning_rate": 2.926662097326936e-07,
"loss": 1.9257,
"step": 682
},
{
"epoch": 1.3317085059712406,
"grad_norm": 0.45073091983795166,
"learning_rate": 2.9232350925291296e-07,
"loss": 2.0667,
"step": 683
},
{
"epoch": 1.333658298805752,
"grad_norm": 0.43848779797554016,
"learning_rate": 2.9198080877313225e-07,
"loss": 2.0127,
"step": 684
},
{
"epoch": 1.3356080916402633,
"grad_norm": 0.44587504863739014,
"learning_rate": 2.916381082933516e-07,
"loss": 2.0694,
"step": 685
},
{
"epoch": 1.3375578844747746,
"grad_norm": 0.46157652139663696,
"learning_rate": 2.9129540781357095e-07,
"loss": 2.112,
"step": 686
},
{
"epoch": 1.3395076773092858,
"grad_norm": 0.461897075176239,
"learning_rate": 2.9095270733379024e-07,
"loss": 2.0431,
"step": 687
},
{
"epoch": 1.3414574701437973,
"grad_norm": 0.42506590485572815,
"learning_rate": 2.906100068540096e-07,
"loss": 2.0612,
"step": 688
},
{
"epoch": 1.3434072629783085,
"grad_norm": 0.43368127942085266,
"learning_rate": 2.9026730637422894e-07,
"loss": 2.0253,
"step": 689
},
{
"epoch": 1.3453570558128198,
"grad_norm": 0.4484082758426666,
"learning_rate": 2.8992460589444823e-07,
"loss": 1.9962,
"step": 690
},
{
"epoch": 1.3473068486473312,
"grad_norm": 0.44570791721343994,
"learning_rate": 2.895819054146676e-07,
"loss": 2.018,
"step": 691
},
{
"epoch": 1.3492566414818425,
"grad_norm": 0.4472144842147827,
"learning_rate": 2.892392049348869e-07,
"loss": 2.0254,
"step": 692
},
{
"epoch": 1.351206434316354,
"grad_norm": 0.4680030047893524,
"learning_rate": 2.888965044551062e-07,
"loss": 2.1265,
"step": 693
},
{
"epoch": 1.3531562271508653,
"grad_norm": 0.44323253631591797,
"learning_rate": 2.885538039753255e-07,
"loss": 2.0222,
"step": 694
},
{
"epoch": 1.3551060199853766,
"grad_norm": 0.4732964038848877,
"learning_rate": 2.882111034955449e-07,
"loss": 2.0219,
"step": 695
},
{
"epoch": 1.357055812819888,
"grad_norm": 0.4392209053039551,
"learning_rate": 2.878684030157642e-07,
"loss": 1.9841,
"step": 696
},
{
"epoch": 1.3590056056543993,
"grad_norm": 0.46177539229393005,
"learning_rate": 2.875257025359835e-07,
"loss": 2.0461,
"step": 697
},
{
"epoch": 1.3609553984889105,
"grad_norm": 0.4625999927520752,
"learning_rate": 2.871830020562029e-07,
"loss": 2.0137,
"step": 698
},
{
"epoch": 1.3629051913234218,
"grad_norm": 0.43552806973457336,
"learning_rate": 2.868403015764222e-07,
"loss": 2.0408,
"step": 699
},
{
"epoch": 1.3648549841579332,
"grad_norm": 0.47674480080604553,
"learning_rate": 2.864976010966415e-07,
"loss": 2.0021,
"step": 700
},
{
"epoch": 1.3668047769924445,
"grad_norm": 0.46479421854019165,
"learning_rate": 2.861549006168609e-07,
"loss": 1.9898,
"step": 701
},
{
"epoch": 1.3687545698269559,
"grad_norm": 0.4399622976779938,
"learning_rate": 2.858122001370802e-07,
"loss": 1.9638,
"step": 702
},
{
"epoch": 1.3707043626614672,
"grad_norm": 0.442557692527771,
"learning_rate": 2.854694996572995e-07,
"loss": 2.0099,
"step": 703
},
{
"epoch": 1.3726541554959786,
"grad_norm": 0.4601743817329407,
"learning_rate": 2.851267991775189e-07,
"loss": 2.057,
"step": 704
},
{
"epoch": 1.37460394833049,
"grad_norm": 0.4959220290184021,
"learning_rate": 2.847840986977382e-07,
"loss": 2.092,
"step": 705
},
{
"epoch": 1.3765537411650013,
"grad_norm": 0.40172404050827026,
"learning_rate": 2.8444139821795747e-07,
"loss": 2.0074,
"step": 706
},
{
"epoch": 1.3785035339995124,
"grad_norm": 0.4572814404964447,
"learning_rate": 2.840986977381768e-07,
"loss": 1.9777,
"step": 707
},
{
"epoch": 1.380453326834024,
"grad_norm": 0.4464624524116516,
"learning_rate": 2.8375599725839617e-07,
"loss": 2.0183,
"step": 708
},
{
"epoch": 1.3824031196685351,
"grad_norm": 0.4498922526836395,
"learning_rate": 2.8341329677861546e-07,
"loss": 2.0975,
"step": 709
},
{
"epoch": 1.3843529125030465,
"grad_norm": 0.4430985748767853,
"learning_rate": 2.830705962988348e-07,
"loss": 2.027,
"step": 710
},
{
"epoch": 1.3863027053375578,
"grad_norm": 0.4422641694545746,
"learning_rate": 2.8272789581905416e-07,
"loss": 2.0625,
"step": 711
},
{
"epoch": 1.3882524981720692,
"grad_norm": 0.46121206879615784,
"learning_rate": 2.8238519533927345e-07,
"loss": 2.0135,
"step": 712
},
{
"epoch": 1.3902022910065805,
"grad_norm": 0.4685353934764862,
"learning_rate": 2.820424948594928e-07,
"loss": 2.071,
"step": 713
},
{
"epoch": 1.392152083841092,
"grad_norm": 0.43733134865760803,
"learning_rate": 2.8169979437971215e-07,
"loss": 2.0531,
"step": 714
},
{
"epoch": 1.3941018766756033,
"grad_norm": 0.4479463994503021,
"learning_rate": 2.8135709389993144e-07,
"loss": 2.0192,
"step": 715
},
{
"epoch": 1.3960516695101146,
"grad_norm": 0.4477840065956116,
"learning_rate": 2.810143934201508e-07,
"loss": 2.0408,
"step": 716
},
{
"epoch": 1.398001462344626,
"grad_norm": 0.44232964515686035,
"learning_rate": 2.8067169294037014e-07,
"loss": 2.0992,
"step": 717
},
{
"epoch": 1.399951255179137,
"grad_norm": 0.4573095142841339,
"learning_rate": 2.8032899246058943e-07,
"loss": 1.9958,
"step": 718
},
{
"epoch": 1.4019010480136487,
"grad_norm": 0.4734794497489929,
"learning_rate": 2.799862919808088e-07,
"loss": 2.0268,
"step": 719
},
{
"epoch": 1.4038508408481598,
"grad_norm": 0.4753987193107605,
"learning_rate": 2.7964359150102807e-07,
"loss": 2.0436,
"step": 720
},
{
"epoch": 1.4058006336826712,
"grad_norm": 0.4515923261642456,
"learning_rate": 2.793008910212474e-07,
"loss": 2.0018,
"step": 721
},
{
"epoch": 1.4077504265171825,
"grad_norm": 0.45925289392471313,
"learning_rate": 2.7895819054146677e-07,
"loss": 2.0454,
"step": 722
},
{
"epoch": 1.4097002193516939,
"grad_norm": 0.4684261083602905,
"learning_rate": 2.7861549006168606e-07,
"loss": 2.0355,
"step": 723
},
{
"epoch": 1.4116500121862052,
"grad_norm": 0.4723130464553833,
"learning_rate": 2.782727895819054e-07,
"loss": 2.0189,
"step": 724
},
{
"epoch": 1.4135998050207166,
"grad_norm": 0.43946054577827454,
"learning_rate": 2.7793008910212476e-07,
"loss": 2.0165,
"step": 725
},
{
"epoch": 1.415549597855228,
"grad_norm": 0.45172879099845886,
"learning_rate": 2.7758738862234405e-07,
"loss": 1.9966,
"step": 726
},
{
"epoch": 1.4174993906897393,
"grad_norm": 0.4361145496368408,
"learning_rate": 2.772446881425634e-07,
"loss": 1.982,
"step": 727
},
{
"epoch": 1.4194491835242506,
"grad_norm": 0.4422454237937927,
"learning_rate": 2.7690198766278275e-07,
"loss": 2.0032,
"step": 728
},
{
"epoch": 1.4213989763587618,
"grad_norm": 0.4438495934009552,
"learning_rate": 2.7655928718300204e-07,
"loss": 2.0198,
"step": 729
},
{
"epoch": 1.4233487691932731,
"grad_norm": 0.4422749876976013,
"learning_rate": 2.762165867032214e-07,
"loss": 1.992,
"step": 730
},
{
"epoch": 1.4252985620277845,
"grad_norm": 0.4652174115180969,
"learning_rate": 2.7587388622344074e-07,
"loss": 2.0345,
"step": 731
},
{
"epoch": 1.4272483548622958,
"grad_norm": 0.46277597546577454,
"learning_rate": 2.7553118574366003e-07,
"loss": 2.0406,
"step": 732
},
{
"epoch": 1.4291981476968072,
"grad_norm": 0.45579442381858826,
"learning_rate": 2.751884852638793e-07,
"loss": 2.0671,
"step": 733
},
{
"epoch": 1.4311479405313186,
"grad_norm": 0.43527230620384216,
"learning_rate": 2.748457847840987e-07,
"loss": 2.0433,
"step": 734
},
{
"epoch": 1.43309773336583,
"grad_norm": 0.4699551463127136,
"learning_rate": 2.74503084304318e-07,
"loss": 2.0366,
"step": 735
},
{
"epoch": 1.4350475262003413,
"grad_norm": 0.4446089565753937,
"learning_rate": 2.741603838245373e-07,
"loss": 1.9986,
"step": 736
},
{
"epoch": 1.4369973190348526,
"grad_norm": 0.4645906686782837,
"learning_rate": 2.738176833447567e-07,
"loss": 2.1331,
"step": 737
},
{
"epoch": 1.4389471118693637,
"grad_norm": 0.46871501207351685,
"learning_rate": 2.73474982864976e-07,
"loss": 2.0402,
"step": 738
},
{
"epoch": 1.4408969047038753,
"grad_norm": 0.4507101774215698,
"learning_rate": 2.731322823851953e-07,
"loss": 2.0027,
"step": 739
},
{
"epoch": 1.4428466975383865,
"grad_norm": 0.4642309546470642,
"learning_rate": 2.727895819054147e-07,
"loss": 2.0613,
"step": 740
},
{
"epoch": 1.4447964903728978,
"grad_norm": 0.4762292206287384,
"learning_rate": 2.72446881425634e-07,
"loss": 2.0315,
"step": 741
},
{
"epoch": 1.4467462832074092,
"grad_norm": 0.4549463391304016,
"learning_rate": 2.721041809458533e-07,
"loss": 2.0492,
"step": 742
},
{
"epoch": 1.4486960760419205,
"grad_norm": 0.4566596448421478,
"learning_rate": 2.717614804660727e-07,
"loss": 1.9571,
"step": 743
},
{
"epoch": 1.4506458688764319,
"grad_norm": 0.4666212797164917,
"learning_rate": 2.71418779986292e-07,
"loss": 1.9897,
"step": 744
},
{
"epoch": 1.4525956617109432,
"grad_norm": 0.45651644468307495,
"learning_rate": 2.710760795065113e-07,
"loss": 2.0471,
"step": 745
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.43935099244117737,
"learning_rate": 2.707333790267306e-07,
"loss": 1.9525,
"step": 746
},
{
"epoch": 1.456495247379966,
"grad_norm": 0.4813799560070038,
"learning_rate": 2.7039067854695e-07,
"loss": 2.0396,
"step": 747
},
{
"epoch": 1.4584450402144773,
"grad_norm": 0.4743799567222595,
"learning_rate": 2.7004797806716927e-07,
"loss": 2.0824,
"step": 748
},
{
"epoch": 1.4603948330489884,
"grad_norm": 0.4927983283996582,
"learning_rate": 2.6970527758738857e-07,
"loss": 2.0257,
"step": 749
},
{
"epoch": 1.4623446258835,
"grad_norm": 0.4711035192012787,
"learning_rate": 2.6936257710760797e-07,
"loss": 2.0487,
"step": 750
},
{
"epoch": 1.4642944187180111,
"grad_norm": 0.4515864849090576,
"learning_rate": 2.6901987662782726e-07,
"loss": 2.0244,
"step": 751
},
{
"epoch": 1.4662442115525225,
"grad_norm": 0.46076542139053345,
"learning_rate": 2.6867717614804656e-07,
"loss": 2.07,
"step": 752
},
{
"epoch": 1.4681940043870338,
"grad_norm": 0.44762691855430603,
"learning_rate": 2.6833447566826596e-07,
"loss": 2.0297,
"step": 753
},
{
"epoch": 1.4701437972215452,
"grad_norm": 0.4801499843597412,
"learning_rate": 2.6799177518848525e-07,
"loss": 2.0683,
"step": 754
},
{
"epoch": 1.4720935900560566,
"grad_norm": 0.45053598284721375,
"learning_rate": 2.6764907470870455e-07,
"loss": 1.9783,
"step": 755
},
{
"epoch": 1.474043382890568,
"grad_norm": 0.45730066299438477,
"learning_rate": 2.673063742289239e-07,
"loss": 2.0548,
"step": 756
},
{
"epoch": 1.4759931757250793,
"grad_norm": 0.4543995261192322,
"learning_rate": 2.6696367374914324e-07,
"loss": 2.0306,
"step": 757
},
{
"epoch": 1.4779429685595906,
"grad_norm": 0.4372531473636627,
"learning_rate": 2.6662097326936254e-07,
"loss": 2.0164,
"step": 758
},
{
"epoch": 1.479892761394102,
"grad_norm": 0.44617414474487305,
"learning_rate": 2.662782727895819e-07,
"loss": 1.9891,
"step": 759
},
{
"epoch": 1.481842554228613,
"grad_norm": 0.4605617821216583,
"learning_rate": 2.6593557230980123e-07,
"loss": 2.01,
"step": 760
},
{
"epoch": 1.4837923470631245,
"grad_norm": 0.4638999402523041,
"learning_rate": 2.655928718300205e-07,
"loss": 2.0685,
"step": 761
},
{
"epoch": 1.4857421398976358,
"grad_norm": 0.4548538327217102,
"learning_rate": 2.6525017135023987e-07,
"loss": 2.0665,
"step": 762
},
{
"epoch": 1.4876919327321472,
"grad_norm": 0.44948044419288635,
"learning_rate": 2.649074708704592e-07,
"loss": 1.9921,
"step": 763
},
{
"epoch": 1.4896417255666585,
"grad_norm": 0.4577581286430359,
"learning_rate": 2.645647703906785e-07,
"loss": 2.0392,
"step": 764
},
{
"epoch": 1.4915915184011699,
"grad_norm": 0.4821256101131439,
"learning_rate": 2.6422206991089786e-07,
"loss": 2.1304,
"step": 765
},
{
"epoch": 1.4935413112356812,
"grad_norm": 0.48839786648750305,
"learning_rate": 2.638793694311172e-07,
"loss": 2.0773,
"step": 766
},
{
"epoch": 1.4954911040701926,
"grad_norm": 0.43702590465545654,
"learning_rate": 2.635366689513365e-07,
"loss": 2.02,
"step": 767
},
{
"epoch": 1.497440896904704,
"grad_norm": 0.45477136969566345,
"learning_rate": 2.6319396847155585e-07,
"loss": 1.9962,
"step": 768
},
{
"epoch": 1.499390689739215,
"grad_norm": 0.47229456901550293,
"learning_rate": 2.6285126799177515e-07,
"loss": 2.0281,
"step": 769
},
{
"epoch": 1.5013404825737267,
"grad_norm": 0.4817400276660919,
"learning_rate": 2.625085675119945e-07,
"loss": 2.1009,
"step": 770
},
{
"epoch": 1.5032902754082378,
"grad_norm": 0.4645569324493408,
"learning_rate": 2.6216586703221384e-07,
"loss": 2.083,
"step": 771
},
{
"epoch": 1.5052400682427494,
"grad_norm": 0.44810667634010315,
"learning_rate": 2.6182316655243314e-07,
"loss": 2.09,
"step": 772
},
{
"epoch": 1.5071898610772605,
"grad_norm": 0.44432902336120605,
"learning_rate": 2.614804660726525e-07,
"loss": 2.0126,
"step": 773
},
{
"epoch": 1.5091396539117719,
"grad_norm": 0.4630286991596222,
"learning_rate": 2.6113776559287183e-07,
"loss": 2.0136,
"step": 774
},
{
"epoch": 1.5110894467462832,
"grad_norm": 0.44443148374557495,
"learning_rate": 2.607950651130911e-07,
"loss": 1.9979,
"step": 775
},
{
"epoch": 1.5130392395807946,
"grad_norm": 0.44903403520584106,
"learning_rate": 2.6045236463331047e-07,
"loss": 1.9788,
"step": 776
},
{
"epoch": 1.514989032415306,
"grad_norm": 0.45394134521484375,
"learning_rate": 2.601096641535298e-07,
"loss": 1.9529,
"step": 777
},
{
"epoch": 1.516938825249817,
"grad_norm": 0.46713778376579285,
"learning_rate": 2.597669636737491e-07,
"loss": 2.0212,
"step": 778
},
{
"epoch": 1.5188886180843286,
"grad_norm": 0.45262840390205383,
"learning_rate": 2.5942426319396846e-07,
"loss": 2.0723,
"step": 779
},
{
"epoch": 1.5208384109188398,
"grad_norm": 0.4648626446723938,
"learning_rate": 2.590815627141878e-07,
"loss": 2.0046,
"step": 780
},
{
"epoch": 1.5227882037533513,
"grad_norm": 0.4754423201084137,
"learning_rate": 2.587388622344071e-07,
"loss": 2.0434,
"step": 781
},
{
"epoch": 1.5247379965878625,
"grad_norm": 0.4271760880947113,
"learning_rate": 2.583961617546264e-07,
"loss": 2.0843,
"step": 782
},
{
"epoch": 1.5266877894223738,
"grad_norm": 0.48139727115631104,
"learning_rate": 2.580534612748458e-07,
"loss": 2.098,
"step": 783
},
{
"epoch": 1.5286375822568852,
"grad_norm": 0.473366379737854,
"learning_rate": 2.577107607950651e-07,
"loss": 2.0422,
"step": 784
},
{
"epoch": 1.5305873750913965,
"grad_norm": 0.4580918848514557,
"learning_rate": 2.573680603152844e-07,
"loss": 2.006,
"step": 785
},
{
"epoch": 1.5325371679259079,
"grad_norm": 0.4635441303253174,
"learning_rate": 2.570253598355038e-07,
"loss": 1.9736,
"step": 786
},
{
"epoch": 1.5344869607604192,
"grad_norm": 0.4621422290802002,
"learning_rate": 2.566826593557231e-07,
"loss": 2.1078,
"step": 787
},
{
"epoch": 1.5364367535949306,
"grad_norm": 0.4151935279369354,
"learning_rate": 2.563399588759424e-07,
"loss": 2.0092,
"step": 788
},
{
"epoch": 1.5383865464294417,
"grad_norm": 0.4793336093425751,
"learning_rate": 2.559972583961618e-07,
"loss": 2.0173,
"step": 789
},
{
"epoch": 1.5403363392639533,
"grad_norm": 0.4768364429473877,
"learning_rate": 2.5565455791638107e-07,
"loss": 2.0813,
"step": 790
},
{
"epoch": 1.5422861320984644,
"grad_norm": 0.452411949634552,
"learning_rate": 2.5531185743660037e-07,
"loss": 2.0527,
"step": 791
},
{
"epoch": 1.544235924932976,
"grad_norm": 0.44334676861763,
"learning_rate": 2.5496915695681977e-07,
"loss": 1.9701,
"step": 792
},
{
"epoch": 1.5461857177674871,
"grad_norm": 0.4465942978858948,
"learning_rate": 2.5462645647703906e-07,
"loss": 1.9905,
"step": 793
},
{
"epoch": 1.5481355106019985,
"grad_norm": 0.4681743085384369,
"learning_rate": 2.5428375599725836e-07,
"loss": 2.0654,
"step": 794
},
{
"epoch": 1.5500853034365099,
"grad_norm": 0.46780961751937866,
"learning_rate": 2.539410555174777e-07,
"loss": 2.0336,
"step": 795
},
{
"epoch": 1.5520350962710212,
"grad_norm": 0.44133254885673523,
"learning_rate": 2.5359835503769705e-07,
"loss": 1.9668,
"step": 796
},
{
"epoch": 1.5539848891055326,
"grad_norm": 0.45011645555496216,
"learning_rate": 2.5325565455791635e-07,
"loss": 2.0099,
"step": 797
},
{
"epoch": 1.555934681940044,
"grad_norm": 0.41162246465682983,
"learning_rate": 2.529129540781357e-07,
"loss": 1.9684,
"step": 798
},
{
"epoch": 1.5578844747745553,
"grad_norm": 0.438760906457901,
"learning_rate": 2.5257025359835504e-07,
"loss": 1.9934,
"step": 799
},
{
"epoch": 1.5598342676090664,
"grad_norm": 0.45921608805656433,
"learning_rate": 2.5222755311857434e-07,
"loss": 2.0447,
"step": 800
},
{
"epoch": 1.561784060443578,
"grad_norm": 0.4474433958530426,
"learning_rate": 2.518848526387937e-07,
"loss": 2.0508,
"step": 801
},
{
"epoch": 1.5637338532780891,
"grad_norm": 0.42901015281677246,
"learning_rate": 2.5154215215901303e-07,
"loss": 2.0607,
"step": 802
},
{
"epoch": 1.5656836461126007,
"grad_norm": 0.4604319632053375,
"learning_rate": 2.511994516792323e-07,
"loss": 2.0142,
"step": 803
},
{
"epoch": 1.5676334389471118,
"grad_norm": 0.4305102527141571,
"learning_rate": 2.5085675119945167e-07,
"loss": 1.9828,
"step": 804
},
{
"epoch": 1.5695832317816232,
"grad_norm": 0.4656990170478821,
"learning_rate": 2.50514050719671e-07,
"loss": 2.0302,
"step": 805
},
{
"epoch": 1.5715330246161345,
"grad_norm": 0.4602496325969696,
"learning_rate": 2.501713502398903e-07,
"loss": 2.0412,
"step": 806
},
{
"epoch": 1.5734828174506459,
"grad_norm": 0.4626891314983368,
"learning_rate": 2.4982864976010966e-07,
"loss": 2.0513,
"step": 807
},
{
"epoch": 1.5754326102851572,
"grad_norm": 0.4671951234340668,
"learning_rate": 2.4948594928032896e-07,
"loss": 2.003,
"step": 808
},
{
"epoch": 1.5773824031196684,
"grad_norm": 0.4399751126766205,
"learning_rate": 2.491432488005483e-07,
"loss": 2.0532,
"step": 809
},
{
"epoch": 1.57933219595418,
"grad_norm": 0.4228038191795349,
"learning_rate": 2.4880054832076765e-07,
"loss": 2.0078,
"step": 810
},
{
"epoch": 1.581281988788691,
"grad_norm": 0.4445479214191437,
"learning_rate": 2.4845784784098695e-07,
"loss": 2.0142,
"step": 811
},
{
"epoch": 1.5832317816232027,
"grad_norm": 0.4397488534450531,
"learning_rate": 2.481151473612063e-07,
"loss": 2.0468,
"step": 812
},
{
"epoch": 1.5851815744577138,
"grad_norm": 0.48187440633773804,
"learning_rate": 2.4777244688142564e-07,
"loss": 2.0444,
"step": 813
},
{
"epoch": 1.5871313672922251,
"grad_norm": 0.4355807304382324,
"learning_rate": 2.4742974640164494e-07,
"loss": 1.9955,
"step": 814
},
{
"epoch": 1.5890811601267365,
"grad_norm": 0.4219972491264343,
"learning_rate": 2.470870459218643e-07,
"loss": 1.9971,
"step": 815
},
{
"epoch": 1.5910309529612479,
"grad_norm": 0.44700267910957336,
"learning_rate": 2.4674434544208363e-07,
"loss": 2.0297,
"step": 816
},
{
"epoch": 1.5929807457957592,
"grad_norm": 0.45433923602104187,
"learning_rate": 2.464016449623029e-07,
"loss": 2.0064,
"step": 817
},
{
"epoch": 1.5949305386302706,
"grad_norm": 0.4188825488090515,
"learning_rate": 2.4605894448252227e-07,
"loss": 2.0236,
"step": 818
},
{
"epoch": 1.596880331464782,
"grad_norm": 0.4635048508644104,
"learning_rate": 2.457162440027416e-07,
"loss": 2.0652,
"step": 819
},
{
"epoch": 1.598830124299293,
"grad_norm": 0.4555036127567291,
"learning_rate": 2.453735435229609e-07,
"loss": 2.079,
"step": 820
},
{
"epoch": 1.6007799171338046,
"grad_norm": 0.45152541995048523,
"learning_rate": 2.4503084304318026e-07,
"loss": 1.9724,
"step": 821
},
{
"epoch": 1.6027297099683158,
"grad_norm": 0.4355667233467102,
"learning_rate": 2.446881425633996e-07,
"loss": 2.0444,
"step": 822
},
{
"epoch": 1.6046795028028273,
"grad_norm": 0.42853429913520813,
"learning_rate": 2.443454420836189e-07,
"loss": 1.9451,
"step": 823
},
{
"epoch": 1.6066292956373385,
"grad_norm": 0.4546351134777069,
"learning_rate": 2.4400274160383825e-07,
"loss": 2.015,
"step": 824
},
{
"epoch": 1.6085790884718498,
"grad_norm": 0.45015424489974976,
"learning_rate": 2.4366004112405755e-07,
"loss": 2.0171,
"step": 825
},
{
"epoch": 1.6105288813063612,
"grad_norm": 0.446065217256546,
"learning_rate": 2.433173406442769e-07,
"loss": 2.0085,
"step": 826
},
{
"epoch": 1.6124786741408725,
"grad_norm": 0.46771183609962463,
"learning_rate": 2.4297464016449624e-07,
"loss": 1.9844,
"step": 827
},
{
"epoch": 1.614428466975384,
"grad_norm": 0.4590853452682495,
"learning_rate": 2.4263193968471554e-07,
"loss": 2.0031,
"step": 828
},
{
"epoch": 1.6163782598098952,
"grad_norm": 0.4465842545032501,
"learning_rate": 2.422892392049349e-07,
"loss": 2.0344,
"step": 829
},
{
"epoch": 1.6183280526444066,
"grad_norm": 0.40251830220222473,
"learning_rate": 2.419465387251542e-07,
"loss": 2.0129,
"step": 830
},
{
"epoch": 1.6202778454789177,
"grad_norm": 0.45284631848335266,
"learning_rate": 2.416038382453735e-07,
"loss": 2.0354,
"step": 831
},
{
"epoch": 1.6222276383134293,
"grad_norm": 0.4733079969882965,
"learning_rate": 2.4126113776559287e-07,
"loss": 1.993,
"step": 832
},
{
"epoch": 1.6241774311479404,
"grad_norm": 0.4264031946659088,
"learning_rate": 2.4091843728581217e-07,
"loss": 2.007,
"step": 833
},
{
"epoch": 1.626127223982452,
"grad_norm": 0.46400555968284607,
"learning_rate": 2.405757368060315e-07,
"loss": 1.9825,
"step": 834
},
{
"epoch": 1.6280770168169632,
"grad_norm": 0.4408418834209442,
"learning_rate": 2.4023303632625086e-07,
"loss": 2.0199,
"step": 835
},
{
"epoch": 1.6300268096514745,
"grad_norm": 0.4353219270706177,
"learning_rate": 2.3989033584647016e-07,
"loss": 1.9767,
"step": 836
},
{
"epoch": 1.6319766024859859,
"grad_norm": 0.47256654500961304,
"learning_rate": 2.395476353666895e-07,
"loss": 2.0708,
"step": 837
},
{
"epoch": 1.6339263953204972,
"grad_norm": 0.44208547472953796,
"learning_rate": 2.392049348869088e-07,
"loss": 2.0518,
"step": 838
},
{
"epoch": 1.6358761881550086,
"grad_norm": 0.4937672019004822,
"learning_rate": 2.3886223440712815e-07,
"loss": 2.043,
"step": 839
},
{
"epoch": 1.6378259809895197,
"grad_norm": 0.46095776557922363,
"learning_rate": 2.385195339273475e-07,
"loss": 2.0421,
"step": 840
},
{
"epoch": 1.6397757738240313,
"grad_norm": 0.4658643901348114,
"learning_rate": 2.3817683344756682e-07,
"loss": 2.0225,
"step": 841
},
{
"epoch": 1.6417255666585424,
"grad_norm": 0.4451207220554352,
"learning_rate": 2.3783413296778616e-07,
"loss": 2.0244,
"step": 842
},
{
"epoch": 1.643675359493054,
"grad_norm": 0.43841567635536194,
"learning_rate": 2.3749143248800546e-07,
"loss": 1.9797,
"step": 843
},
{
"epoch": 1.6456251523275651,
"grad_norm": 0.45495790243148804,
"learning_rate": 2.371487320082248e-07,
"loss": 2.039,
"step": 844
},
{
"epoch": 1.6475749451620765,
"grad_norm": 0.4694961607456207,
"learning_rate": 2.3680603152844415e-07,
"loss": 2.0232,
"step": 845
},
{
"epoch": 1.6495247379965878,
"grad_norm": 0.4593546986579895,
"learning_rate": 2.3646333104866345e-07,
"loss": 2.0495,
"step": 846
},
{
"epoch": 1.6514745308310992,
"grad_norm": 0.4738862216472626,
"learning_rate": 2.361206305688828e-07,
"loss": 2.0105,
"step": 847
},
{
"epoch": 1.6534243236656105,
"grad_norm": 0.45088139176368713,
"learning_rate": 2.357779300891021e-07,
"loss": 2.0418,
"step": 848
},
{
"epoch": 1.655374116500122,
"grad_norm": 0.4501790702342987,
"learning_rate": 2.3543522960932144e-07,
"loss": 2.0531,
"step": 849
},
{
"epoch": 1.6573239093346332,
"grad_norm": 0.47187909483909607,
"learning_rate": 2.3509252912954078e-07,
"loss": 1.9907,
"step": 850
},
{
"epoch": 1.6592737021691444,
"grad_norm": 0.46769675612449646,
"learning_rate": 2.3474982864976008e-07,
"loss": 2.0145,
"step": 851
},
{
"epoch": 1.661223495003656,
"grad_norm": 0.44854676723480225,
"learning_rate": 2.3440712816997943e-07,
"loss": 2.0381,
"step": 852
},
{
"epoch": 1.663173287838167,
"grad_norm": 0.4576641023159027,
"learning_rate": 2.3406442769019877e-07,
"loss": 1.9722,
"step": 853
},
{
"epoch": 1.6651230806726787,
"grad_norm": 0.4568294584751129,
"learning_rate": 2.3372172721041807e-07,
"loss": 1.9744,
"step": 854
},
{
"epoch": 1.6670728735071898,
"grad_norm": 0.4591883718967438,
"learning_rate": 2.3337902673063742e-07,
"loss": 1.9666,
"step": 855
},
{
"epoch": 1.6690226663417012,
"grad_norm": 0.44672197103500366,
"learning_rate": 2.3303632625085674e-07,
"loss": 1.9944,
"step": 856
},
{
"epoch": 1.6709724591762125,
"grad_norm": 0.4896506667137146,
"learning_rate": 2.3269362577107606e-07,
"loss": 2.0492,
"step": 857
},
{
"epoch": 1.6729222520107239,
"grad_norm": 0.4453061521053314,
"learning_rate": 2.323509252912954e-07,
"loss": 1.9757,
"step": 858
},
{
"epoch": 1.6748720448452352,
"grad_norm": 0.4569021761417389,
"learning_rate": 2.3200822481151473e-07,
"loss": 2.0523,
"step": 859
},
{
"epoch": 1.6768218376797466,
"grad_norm": 0.4553905427455902,
"learning_rate": 2.3166552433173405e-07,
"loss": 2.0189,
"step": 860
},
{
"epoch": 1.678771630514258,
"grad_norm": 0.4560829699039459,
"learning_rate": 2.3132282385195337e-07,
"loss": 2.0833,
"step": 861
},
{
"epoch": 1.680721423348769,
"grad_norm": 0.4487151503562927,
"learning_rate": 2.3098012337217272e-07,
"loss": 1.9806,
"step": 862
},
{
"epoch": 1.6826712161832806,
"grad_norm": 0.440891832113266,
"learning_rate": 2.3063742289239204e-07,
"loss": 1.9989,
"step": 863
},
{
"epoch": 1.6846210090177918,
"grad_norm": 0.469881534576416,
"learning_rate": 2.3029472241261136e-07,
"loss": 2.0626,
"step": 864
},
{
"epoch": 1.6865708018523033,
"grad_norm": 0.43621349334716797,
"learning_rate": 2.299520219328307e-07,
"loss": 2.063,
"step": 865
},
{
"epoch": 1.6885205946868145,
"grad_norm": 0.45750436186790466,
"learning_rate": 2.2960932145305003e-07,
"loss": 2.0164,
"step": 866
},
{
"epoch": 1.6904703875213258,
"grad_norm": 0.46832090616226196,
"learning_rate": 2.2926662097326935e-07,
"loss": 2.0459,
"step": 867
},
{
"epoch": 1.6924201803558372,
"grad_norm": 0.4424852728843689,
"learning_rate": 2.289239204934887e-07,
"loss": 2.0148,
"step": 868
},
{
"epoch": 1.6943699731903485,
"grad_norm": 0.4639265239238739,
"learning_rate": 2.28581220013708e-07,
"loss": 2.0453,
"step": 869
},
{
"epoch": 1.69631976602486,
"grad_norm": 0.42720574140548706,
"learning_rate": 2.2823851953392734e-07,
"loss": 2.0164,
"step": 870
},
{
"epoch": 1.698269558859371,
"grad_norm": 0.46615973114967346,
"learning_rate": 2.2789581905414668e-07,
"loss": 2.0235,
"step": 871
},
{
"epoch": 1.7002193516938826,
"grad_norm": 0.46956273913383484,
"learning_rate": 2.2755311857436598e-07,
"loss": 2.0668,
"step": 872
},
{
"epoch": 1.7021691445283937,
"grad_norm": 0.45590096712112427,
"learning_rate": 2.2721041809458533e-07,
"loss": 2.0767,
"step": 873
},
{
"epoch": 1.7041189373629053,
"grad_norm": 0.4419032037258148,
"learning_rate": 2.2686771761480465e-07,
"loss": 2.0298,
"step": 874
},
{
"epoch": 1.7060687301974164,
"grad_norm": 0.48438993096351624,
"learning_rate": 2.2652501713502397e-07,
"loss": 2.0881,
"step": 875
},
{
"epoch": 1.7080185230319278,
"grad_norm": 0.4674246609210968,
"learning_rate": 2.2618231665524332e-07,
"loss": 1.9858,
"step": 876
},
{
"epoch": 1.7099683158664392,
"grad_norm": 0.4731968641281128,
"learning_rate": 2.2583961617546264e-07,
"loss": 2.0684,
"step": 877
},
{
"epoch": 1.7119181087009505,
"grad_norm": 0.44370540976524353,
"learning_rate": 2.2549691569568196e-07,
"loss": 2.0222,
"step": 878
},
{
"epoch": 1.7138679015354619,
"grad_norm": 0.43057727813720703,
"learning_rate": 2.251542152159013e-07,
"loss": 2.0054,
"step": 879
},
{
"epoch": 1.7158176943699732,
"grad_norm": 0.4575825035572052,
"learning_rate": 2.2481151473612063e-07,
"loss": 2.0194,
"step": 880
},
{
"epoch": 1.7177674872044846,
"grad_norm": 0.46100616455078125,
"learning_rate": 2.2446881425633995e-07,
"loss": 2.0362,
"step": 881
},
{
"epoch": 1.7197172800389957,
"grad_norm": 0.46780040860176086,
"learning_rate": 2.2412611377655927e-07,
"loss": 2.0458,
"step": 882
},
{
"epoch": 1.7216670728735073,
"grad_norm": 0.4316709339618683,
"learning_rate": 2.2378341329677862e-07,
"loss": 2.0401,
"step": 883
},
{
"epoch": 1.7236168657080184,
"grad_norm": 0.43883568048477173,
"learning_rate": 2.2344071281699794e-07,
"loss": 2.0407,
"step": 884
},
{
"epoch": 1.72556665854253,
"grad_norm": 0.44989317655563354,
"learning_rate": 2.2309801233721726e-07,
"loss": 2.0253,
"step": 885
},
{
"epoch": 1.7275164513770411,
"grad_norm": 0.4468737840652466,
"learning_rate": 2.227553118574366e-07,
"loss": 2.0336,
"step": 886
},
{
"epoch": 1.7294662442115525,
"grad_norm": 0.45126405358314514,
"learning_rate": 2.224126113776559e-07,
"loss": 2.0259,
"step": 887
},
{
"epoch": 1.7314160370460638,
"grad_norm": 0.43270209431648254,
"learning_rate": 2.2206991089787525e-07,
"loss": 2.0071,
"step": 888
},
{
"epoch": 1.7333658298805752,
"grad_norm": 0.4503726363182068,
"learning_rate": 2.217272104180946e-07,
"loss": 2.1025,
"step": 889
},
{
"epoch": 1.7353156227150865,
"grad_norm": 0.44900792837142944,
"learning_rate": 2.213845099383139e-07,
"loss": 1.9883,
"step": 890
},
{
"epoch": 1.737265415549598,
"grad_norm": 0.4531221091747284,
"learning_rate": 2.2104180945853324e-07,
"loss": 2.0095,
"step": 891
},
{
"epoch": 1.7392152083841093,
"grad_norm": 0.46359124779701233,
"learning_rate": 2.2069910897875258e-07,
"loss": 2.003,
"step": 892
},
{
"epoch": 1.7411650012186204,
"grad_norm": 0.4506163001060486,
"learning_rate": 2.2035640849897188e-07,
"loss": 1.9438,
"step": 893
},
{
"epoch": 1.743114794053132,
"grad_norm": 0.4618943929672241,
"learning_rate": 2.2001370801919123e-07,
"loss": 2.0772,
"step": 894
},
{
"epoch": 1.745064586887643,
"grad_norm": 0.4341379404067993,
"learning_rate": 2.1967100753941055e-07,
"loss": 1.9443,
"step": 895
},
{
"epoch": 1.7470143797221547,
"grad_norm": 0.4800126254558563,
"learning_rate": 2.1932830705962987e-07,
"loss": 1.9994,
"step": 896
},
{
"epoch": 1.7489641725566658,
"grad_norm": 0.45474764704704285,
"learning_rate": 2.1898560657984922e-07,
"loss": 2.0635,
"step": 897
},
{
"epoch": 1.7509139653911772,
"grad_norm": 0.44301092624664307,
"learning_rate": 2.1864290610006854e-07,
"loss": 1.9752,
"step": 898
},
{
"epoch": 1.7528637582256885,
"grad_norm": 0.4428479075431824,
"learning_rate": 2.1830020562028786e-07,
"loss": 1.9371,
"step": 899
},
{
"epoch": 1.7548135510601999,
"grad_norm": 0.4576126039028168,
"learning_rate": 2.1795750514050718e-07,
"loss": 2.063,
"step": 900
},
{
"epoch": 1.7567633438947112,
"grad_norm": 0.47722387313842773,
"learning_rate": 2.1761480466072653e-07,
"loss": 2.0743,
"step": 901
},
{
"epoch": 1.7587131367292224,
"grad_norm": 0.4575481712818146,
"learning_rate": 2.1727210418094585e-07,
"loss": 1.9873,
"step": 902
},
{
"epoch": 1.760662929563734,
"grad_norm": 0.4340214729309082,
"learning_rate": 2.1692940370116517e-07,
"loss": 1.9459,
"step": 903
},
{
"epoch": 1.762612722398245,
"grad_norm": 0.41616639494895935,
"learning_rate": 2.1658670322138452e-07,
"loss": 1.9505,
"step": 904
},
{
"epoch": 1.7645625152327566,
"grad_norm": 0.472650408744812,
"learning_rate": 2.162440027416038e-07,
"loss": 2.0594,
"step": 905
},
{
"epoch": 1.7665123080672678,
"grad_norm": 0.4756447374820709,
"learning_rate": 2.1590130226182316e-07,
"loss": 1.9695,
"step": 906
},
{
"epoch": 1.7684621009017791,
"grad_norm": 0.44738152623176575,
"learning_rate": 2.155586017820425e-07,
"loss": 2.0771,
"step": 907
},
{
"epoch": 1.7704118937362905,
"grad_norm": 0.4602157771587372,
"learning_rate": 2.152159013022618e-07,
"loss": 2.0813,
"step": 908
},
{
"epoch": 1.7723616865708018,
"grad_norm": 0.46765050292015076,
"learning_rate": 2.1487320082248115e-07,
"loss": 2.0801,
"step": 909
},
{
"epoch": 1.7743114794053132,
"grad_norm": 0.4703747034072876,
"learning_rate": 2.145305003427005e-07,
"loss": 2.0093,
"step": 910
},
{
"epoch": 1.7762612722398246,
"grad_norm": 0.48457059264183044,
"learning_rate": 2.141877998629198e-07,
"loss": 2.0528,
"step": 911
},
{
"epoch": 1.778211065074336,
"grad_norm": 0.478710412979126,
"learning_rate": 2.1384509938313914e-07,
"loss": 2.1099,
"step": 912
},
{
"epoch": 1.780160857908847,
"grad_norm": 0.4458109438419342,
"learning_rate": 2.1350239890335843e-07,
"loss": 2.0592,
"step": 913
},
{
"epoch": 1.7821106507433586,
"grad_norm": 0.4474625885486603,
"learning_rate": 2.1315969842357778e-07,
"loss": 2.0055,
"step": 914
},
{
"epoch": 1.7840604435778697,
"grad_norm": 0.4586813151836395,
"learning_rate": 2.1281699794379713e-07,
"loss": 2.0131,
"step": 915
},
{
"epoch": 1.7860102364123813,
"grad_norm": 0.45083218812942505,
"learning_rate": 2.1247429746401642e-07,
"loss": 2.0437,
"step": 916
},
{
"epoch": 1.7879600292468925,
"grad_norm": 0.44078171253204346,
"learning_rate": 2.1213159698423577e-07,
"loss": 1.9792,
"step": 917
},
{
"epoch": 1.7899098220814038,
"grad_norm": 0.4346940219402313,
"learning_rate": 2.117888965044551e-07,
"loss": 1.9933,
"step": 918
},
{
"epoch": 1.7918596149159152,
"grad_norm": 0.45846906304359436,
"learning_rate": 2.114461960246744e-07,
"loss": 1.9682,
"step": 919
},
{
"epoch": 1.7938094077504265,
"grad_norm": 0.4335155785083771,
"learning_rate": 2.1110349554489376e-07,
"loss": 2.03,
"step": 920
},
{
"epoch": 1.7957592005849379,
"grad_norm": 0.4618023633956909,
"learning_rate": 2.1076079506511308e-07,
"loss": 2.0966,
"step": 921
},
{
"epoch": 1.7977089934194492,
"grad_norm": 0.46044906973838806,
"learning_rate": 2.104180945853324e-07,
"loss": 2.0873,
"step": 922
},
{
"epoch": 1.7996587862539606,
"grad_norm": 0.4635170102119446,
"learning_rate": 2.1007539410555175e-07,
"loss": 1.9897,
"step": 923
},
{
"epoch": 1.8016085790884717,
"grad_norm": 0.4335494637489319,
"learning_rate": 2.0973269362577107e-07,
"loss": 2.0228,
"step": 924
},
{
"epoch": 1.8035583719229833,
"grad_norm": 0.44605642557144165,
"learning_rate": 2.093899931459904e-07,
"loss": 2.0561,
"step": 925
},
{
"epoch": 1.8055081647574944,
"grad_norm": 0.4611765146255493,
"learning_rate": 2.090472926662097e-07,
"loss": 2.0329,
"step": 926
},
{
"epoch": 1.807457957592006,
"grad_norm": 0.443036288022995,
"learning_rate": 2.0870459218642906e-07,
"loss": 1.9565,
"step": 927
},
{
"epoch": 1.8094077504265171,
"grad_norm": 0.4552265405654907,
"learning_rate": 2.0836189170664838e-07,
"loss": 2.0842,
"step": 928
},
{
"epoch": 1.8113575432610285,
"grad_norm": 0.41511160135269165,
"learning_rate": 2.080191912268677e-07,
"loss": 2.0043,
"step": 929
},
{
"epoch": 1.8133073360955398,
"grad_norm": 0.44421470165252686,
"learning_rate": 2.0767649074708705e-07,
"loss": 2.0433,
"step": 930
},
{
"epoch": 1.8152571289300512,
"grad_norm": 0.43709036707878113,
"learning_rate": 2.0733379026730634e-07,
"loss": 2.0405,
"step": 931
},
{
"epoch": 1.8172069217645626,
"grad_norm": 0.429074227809906,
"learning_rate": 2.069910897875257e-07,
"loss": 1.964,
"step": 932
},
{
"epoch": 1.8191567145990737,
"grad_norm": 0.4392930269241333,
"learning_rate": 2.0664838930774504e-07,
"loss": 1.9819,
"step": 933
},
{
"epoch": 1.8211065074335853,
"grad_norm": 0.41590166091918945,
"learning_rate": 2.0630568882796433e-07,
"loss": 1.9821,
"step": 934
},
{
"epoch": 1.8230563002680964,
"grad_norm": 0.445362389087677,
"learning_rate": 2.0596298834818368e-07,
"loss": 2.092,
"step": 935
},
{
"epoch": 1.825006093102608,
"grad_norm": 0.43674713373184204,
"learning_rate": 2.0562028786840303e-07,
"loss": 2.0371,
"step": 936
},
{
"epoch": 1.826955885937119,
"grad_norm": 0.4520663022994995,
"learning_rate": 2.0527758738862232e-07,
"loss": 2.0329,
"step": 937
},
{
"epoch": 1.8289056787716305,
"grad_norm": 0.4744395613670349,
"learning_rate": 2.0493488690884167e-07,
"loss": 2.0828,
"step": 938
},
{
"epoch": 1.8308554716061418,
"grad_norm": 0.45714208483695984,
"learning_rate": 2.04592186429061e-07,
"loss": 2.017,
"step": 939
},
{
"epoch": 1.8328052644406532,
"grad_norm": 0.4604392647743225,
"learning_rate": 2.042494859492803e-07,
"loss": 1.9813,
"step": 940
},
{
"epoch": 1.8347550572751645,
"grad_norm": 0.43890222907066345,
"learning_rate": 2.0390678546949966e-07,
"loss": 1.9902,
"step": 941
},
{
"epoch": 1.8367048501096759,
"grad_norm": 0.44383513927459717,
"learning_rate": 2.0356408498971898e-07,
"loss": 2.0434,
"step": 942
},
{
"epoch": 1.8386546429441872,
"grad_norm": 0.43706512451171875,
"learning_rate": 2.032213845099383e-07,
"loss": 2.052,
"step": 943
},
{
"epoch": 1.8406044357786984,
"grad_norm": 0.427843302488327,
"learning_rate": 2.0287868403015762e-07,
"loss": 1.8841,
"step": 944
},
{
"epoch": 1.84255422861321,
"grad_norm": 0.4639602601528168,
"learning_rate": 2.0253598355037697e-07,
"loss": 2.0831,
"step": 945
},
{
"epoch": 1.844504021447721,
"grad_norm": 0.44139614701271057,
"learning_rate": 2.021932830705963e-07,
"loss": 1.9867,
"step": 946
},
{
"epoch": 1.8464538142822327,
"grad_norm": 0.4408351182937622,
"learning_rate": 2.018505825908156e-07,
"loss": 2.0199,
"step": 947
},
{
"epoch": 1.8484036071167438,
"grad_norm": 0.49647897481918335,
"learning_rate": 2.0150788211103496e-07,
"loss": 2.0877,
"step": 948
},
{
"epoch": 1.8503533999512551,
"grad_norm": 0.46033725142478943,
"learning_rate": 2.0116518163125428e-07,
"loss": 2.0584,
"step": 949
},
{
"epoch": 1.8523031927857665,
"grad_norm": 0.4471881687641144,
"learning_rate": 2.008224811514736e-07,
"loss": 1.9694,
"step": 950
},
{
"epoch": 1.8542529856202778,
"grad_norm": 0.435660183429718,
"learning_rate": 2.0047978067169295e-07,
"loss": 2.0025,
"step": 951
},
{
"epoch": 1.8562027784547892,
"grad_norm": 0.4504587650299072,
"learning_rate": 2.0013708019191224e-07,
"loss": 2.0403,
"step": 952
},
{
"epoch": 1.8581525712893006,
"grad_norm": 0.446451336145401,
"learning_rate": 1.997943797121316e-07,
"loss": 1.9817,
"step": 953
},
{
"epoch": 1.860102364123812,
"grad_norm": 0.46191105246543884,
"learning_rate": 1.9945167923235094e-07,
"loss": 2.0329,
"step": 954
},
{
"epoch": 1.862052156958323,
"grad_norm": 0.4477747976779938,
"learning_rate": 1.9910897875257023e-07,
"loss": 2.0113,
"step": 955
},
{
"epoch": 1.8640019497928346,
"grad_norm": 0.46400219202041626,
"learning_rate": 1.9876627827278958e-07,
"loss": 2.0142,
"step": 956
},
{
"epoch": 1.8659517426273458,
"grad_norm": 0.45763564109802246,
"learning_rate": 1.984235777930089e-07,
"loss": 2.0555,
"step": 957
},
{
"epoch": 1.8679015354618573,
"grad_norm": 0.4603627920150757,
"learning_rate": 1.9808087731322822e-07,
"loss": 2.0022,
"step": 958
},
{
"epoch": 1.8698513282963685,
"grad_norm": 0.5134696364402771,
"learning_rate": 1.9773817683344757e-07,
"loss": 2.0396,
"step": 959
},
{
"epoch": 1.8718011211308798,
"grad_norm": 0.46097123622894287,
"learning_rate": 1.973954763536669e-07,
"loss": 2.0887,
"step": 960
},
{
"epoch": 1.8737509139653912,
"grad_norm": 0.45269545912742615,
"learning_rate": 1.970527758738862e-07,
"loss": 2.0184,
"step": 961
},
{
"epoch": 1.8757007067999025,
"grad_norm": 0.463885635137558,
"learning_rate": 1.9671007539410553e-07,
"loss": 2.0701,
"step": 962
},
{
"epoch": 1.8776504996344139,
"grad_norm": 0.4765574634075165,
"learning_rate": 1.9636737491432488e-07,
"loss": 1.9951,
"step": 963
},
{
"epoch": 1.879600292468925,
"grad_norm": 0.48183631896972656,
"learning_rate": 1.960246744345442e-07,
"loss": 2.0723,
"step": 964
},
{
"epoch": 1.8815500853034366,
"grad_norm": 0.44266360998153687,
"learning_rate": 1.9568197395476352e-07,
"loss": 2.0134,
"step": 965
},
{
"epoch": 1.8834998781379477,
"grad_norm": 0.4508133828639984,
"learning_rate": 1.9533927347498287e-07,
"loss": 1.9951,
"step": 966
},
{
"epoch": 1.8854496709724593,
"grad_norm": 0.4255620539188385,
"learning_rate": 1.949965729952022e-07,
"loss": 1.9663,
"step": 967
},
{
"epoch": 1.8873994638069704,
"grad_norm": 0.45423394441604614,
"learning_rate": 1.946538725154215e-07,
"loss": 2.0072,
"step": 968
},
{
"epoch": 1.8893492566414818,
"grad_norm": 0.4226663112640381,
"learning_rate": 1.9431117203564086e-07,
"loss": 1.9598,
"step": 969
},
{
"epoch": 1.8912990494759931,
"grad_norm": 0.47366762161254883,
"learning_rate": 1.9396847155586015e-07,
"loss": 1.9927,
"step": 970
},
{
"epoch": 1.8932488423105045,
"grad_norm": 0.44758790731430054,
"learning_rate": 1.936257710760795e-07,
"loss": 1.9628,
"step": 971
},
{
"epoch": 1.8951986351450159,
"grad_norm": 0.48197463154792786,
"learning_rate": 1.9328307059629885e-07,
"loss": 2.1004,
"step": 972
},
{
"epoch": 1.8971484279795272,
"grad_norm": 0.4538448750972748,
"learning_rate": 1.9294037011651814e-07,
"loss": 2.0199,
"step": 973
},
{
"epoch": 1.8990982208140386,
"grad_norm": 0.47362738847732544,
"learning_rate": 1.925976696367375e-07,
"loss": 2.0746,
"step": 974
},
{
"epoch": 1.9010480136485497,
"grad_norm": 0.47095638513565063,
"learning_rate": 1.922549691569568e-07,
"loss": 1.9897,
"step": 975
},
{
"epoch": 1.9029978064830613,
"grad_norm": 0.4763641059398651,
"learning_rate": 1.9191226867717613e-07,
"loss": 2.0156,
"step": 976
},
{
"epoch": 1.9049475993175724,
"grad_norm": 0.4224942922592163,
"learning_rate": 1.9156956819739548e-07,
"loss": 2.0114,
"step": 977
},
{
"epoch": 1.906897392152084,
"grad_norm": 0.44930440187454224,
"learning_rate": 1.912268677176148e-07,
"loss": 2.0121,
"step": 978
},
{
"epoch": 1.9088471849865951,
"grad_norm": 0.45916110277175903,
"learning_rate": 1.9088416723783412e-07,
"loss": 2.0053,
"step": 979
},
{
"epoch": 1.9107969778211065,
"grad_norm": 0.42759600281715393,
"learning_rate": 1.9054146675805347e-07,
"loss": 2.0109,
"step": 980
},
{
"epoch": 1.9127467706556178,
"grad_norm": 0.49347975850105286,
"learning_rate": 1.901987662782728e-07,
"loss": 2.0657,
"step": 981
},
{
"epoch": 1.9146965634901292,
"grad_norm": 0.4315294027328491,
"learning_rate": 1.898560657984921e-07,
"loss": 1.9473,
"step": 982
},
{
"epoch": 1.9166463563246405,
"grad_norm": 0.42915600538253784,
"learning_rate": 1.8951336531871143e-07,
"loss": 1.9958,
"step": 983
},
{
"epoch": 1.9185961491591519,
"grad_norm": 0.48152124881744385,
"learning_rate": 1.8917066483893078e-07,
"loss": 2.0815,
"step": 984
},
{
"epoch": 1.9205459419936632,
"grad_norm": 0.44423532485961914,
"learning_rate": 1.888279643591501e-07,
"loss": 2.0227,
"step": 985
},
{
"epoch": 1.9224957348281744,
"grad_norm": 0.4499359130859375,
"learning_rate": 1.8848526387936942e-07,
"loss": 1.961,
"step": 986
},
{
"epoch": 1.924445527662686,
"grad_norm": 0.4560549855232239,
"learning_rate": 1.8814256339958877e-07,
"loss": 2.03,
"step": 987
},
{
"epoch": 1.926395320497197,
"grad_norm": 0.48396381735801697,
"learning_rate": 1.8779986291980806e-07,
"loss": 1.985,
"step": 988
},
{
"epoch": 1.9283451133317087,
"grad_norm": 0.456910103559494,
"learning_rate": 1.874571624400274e-07,
"loss": 1.9802,
"step": 989
},
{
"epoch": 1.9302949061662198,
"grad_norm": 0.46041303873062134,
"learning_rate": 1.8711446196024676e-07,
"loss": 1.9507,
"step": 990
},
{
"epoch": 1.9322446990007311,
"grad_norm": 0.4496663510799408,
"learning_rate": 1.8677176148046605e-07,
"loss": 2.0329,
"step": 991
},
{
"epoch": 1.9341944918352425,
"grad_norm": 0.4381345212459564,
"learning_rate": 1.864290610006854e-07,
"loss": 1.9643,
"step": 992
},
{
"epoch": 1.9361442846697539,
"grad_norm": 0.43699464201927185,
"learning_rate": 1.8608636052090475e-07,
"loss": 2.026,
"step": 993
},
{
"epoch": 1.9380940775042652,
"grad_norm": 0.4496040344238281,
"learning_rate": 1.8574366004112404e-07,
"loss": 1.9318,
"step": 994
},
{
"epoch": 1.9400438703387763,
"grad_norm": 0.45028945803642273,
"learning_rate": 1.854009595613434e-07,
"loss": 2.0254,
"step": 995
},
{
"epoch": 1.941993663173288,
"grad_norm": 0.46241873502731323,
"learning_rate": 1.8505825908156268e-07,
"loss": 2.0224,
"step": 996
},
{
"epoch": 1.943943456007799,
"grad_norm": 0.4494277238845825,
"learning_rate": 1.8471555860178203e-07,
"loss": 2.0734,
"step": 997
},
{
"epoch": 1.9458932488423106,
"grad_norm": 0.44225579500198364,
"learning_rate": 1.8437285812200138e-07,
"loss": 2.0548,
"step": 998
},
{
"epoch": 1.9478430416768218,
"grad_norm": 0.4850820004940033,
"learning_rate": 1.8403015764222067e-07,
"loss": 1.9961,
"step": 999
},
{
"epoch": 1.9497928345113331,
"grad_norm": 0.46442610025405884,
"learning_rate": 1.8368745716244002e-07,
"loss": 1.9777,
"step": 1000
},
{
"epoch": 1.9517426273458445,
"grad_norm": 0.457109272480011,
"learning_rate": 1.8334475668265934e-07,
"loss": 2.0949,
"step": 1001
},
{
"epoch": 1.9536924201803558,
"grad_norm": 0.4514349699020386,
"learning_rate": 1.8300205620287866e-07,
"loss": 2.0933,
"step": 1002
},
{
"epoch": 1.9556422130148672,
"grad_norm": 0.4601777195930481,
"learning_rate": 1.82659355723098e-07,
"loss": 1.9975,
"step": 1003
},
{
"epoch": 1.9575920058493785,
"grad_norm": 0.4604569673538208,
"learning_rate": 1.8231665524331733e-07,
"loss": 2.0364,
"step": 1004
},
{
"epoch": 1.95954179868389,
"grad_norm": 0.4434170424938202,
"learning_rate": 1.8197395476353665e-07,
"loss": 1.9835,
"step": 1005
},
{
"epoch": 1.961491591518401,
"grad_norm": 0.45063334703445435,
"learning_rate": 1.81631254283756e-07,
"loss": 1.9904,
"step": 1006
},
{
"epoch": 1.9634413843529126,
"grad_norm": 0.45276153087615967,
"learning_rate": 1.8128855380397532e-07,
"loss": 2.021,
"step": 1007
},
{
"epoch": 1.9653911771874237,
"grad_norm": 0.44774502515792847,
"learning_rate": 1.8094585332419464e-07,
"loss": 2.0024,
"step": 1008
},
{
"epoch": 1.9673409700219353,
"grad_norm": 0.43734362721443176,
"learning_rate": 1.8060315284441396e-07,
"loss": 2.0261,
"step": 1009
},
{
"epoch": 1.9692907628564464,
"grad_norm": 0.45293501019477844,
"learning_rate": 1.802604523646333e-07,
"loss": 2.0781,
"step": 1010
},
{
"epoch": 1.9712405556909578,
"grad_norm": 0.4538004994392395,
"learning_rate": 1.7991775188485263e-07,
"loss": 2.0081,
"step": 1011
},
{
"epoch": 1.9731903485254692,
"grad_norm": 0.45042964816093445,
"learning_rate": 1.7957505140507195e-07,
"loss": 2.0121,
"step": 1012
},
{
"epoch": 1.9751401413599805,
"grad_norm": 0.4721399247646332,
"learning_rate": 1.792323509252913e-07,
"loss": 2.0071,
"step": 1013
},
{
"epoch": 1.9770899341944919,
"grad_norm": 0.4297287166118622,
"learning_rate": 1.788896504455106e-07,
"loss": 2.0213,
"step": 1014
},
{
"epoch": 1.9790397270290032,
"grad_norm": 0.4454828202724457,
"learning_rate": 1.7854694996572994e-07,
"loss": 2.0093,
"step": 1015
},
{
"epoch": 1.9809895198635146,
"grad_norm": 0.4550788700580597,
"learning_rate": 1.782042494859493e-07,
"loss": 2.0599,
"step": 1016
},
{
"epoch": 1.9829393126980257,
"grad_norm": 0.44854849576950073,
"learning_rate": 1.7786154900616858e-07,
"loss": 2.0262,
"step": 1017
},
{
"epoch": 1.9848891055325373,
"grad_norm": 0.4477459192276001,
"learning_rate": 1.7751884852638793e-07,
"loss": 1.9533,
"step": 1018
},
{
"epoch": 1.9868388983670484,
"grad_norm": 0.43663471937179565,
"learning_rate": 1.7717614804660728e-07,
"loss": 2.0122,
"step": 1019
},
{
"epoch": 1.98878869120156,
"grad_norm": 0.45281800627708435,
"learning_rate": 1.7683344756682657e-07,
"loss": 2.0711,
"step": 1020
},
{
"epoch": 1.9907384840360711,
"grad_norm": 0.44143861532211304,
"learning_rate": 1.7649074708704592e-07,
"loss": 2.0198,
"step": 1021
},
{
"epoch": 1.9926882768705825,
"grad_norm": 0.4464763402938843,
"learning_rate": 1.7614804660726524e-07,
"loss": 2.0117,
"step": 1022
},
{
"epoch": 1.9946380697050938,
"grad_norm": 0.42707762122154236,
"learning_rate": 1.7580534612748456e-07,
"loss": 1.9629,
"step": 1023
},
{
"epoch": 1.9965878625396052,
"grad_norm": 0.4683617949485779,
"learning_rate": 1.754626456477039e-07,
"loss": 2.0467,
"step": 1024
},
{
"epoch": 1.9985376553741165,
"grad_norm": 0.4215565025806427,
"learning_rate": 1.7511994516792323e-07,
"loss": 1.9545,
"step": 1025
},
{
"epoch": 1.9985376553741165,
"eval_loss": 2.0196783542633057,
"eval_runtime": 480.5583,
"eval_samples_per_second": 1.294,
"eval_steps_per_second": 0.325,
"step": 1025
}
],
"logging_steps": 1,
"max_steps": 1536,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.0284206992292577e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}