beta-test-checkpoint-1000 / trainer_state.json
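This file records the Trainer state at global step 1000: the header fields give the current epoch and checkpoint bookkeeping, and log_history holds one entry per logged step with epoch, grad_norm, learning_rate, loss, and step. A minimal sketch for inspecting such a dump with Python's standard library (the local filename "trainer_state.json" is an assumption, not something stated by the checkpoint):

import json

# Load the checkpoint's trainer state (assumed saved locally as trainer_state.json).
with open("trainer_state.json") as f:
    state = json.load(f)

print("global_step:", state["global_step"], "epoch:", state["epoch"])

# Each log_history entry holds epoch, grad_norm, learning_rate, loss, and step.
losses = [entry["loss"] for entry in state["log_history"] if "loss" in entry]
print("logged steps with a loss:", len(losses))
print("first / last / min loss:", losses[0], losses[-1], min(losses))

The raw contents of trainer_state.json follow.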
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.30353619669145543,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00030353619669145547,
"grad_norm": 9.667811393737793,
"learning_rate": 1e-05,
"loss": 5.0202,
"step": 1
},
{
"epoch": 0.0006070723933829109,
"grad_norm": 10.303421974182129,
"learning_rate": 2e-05,
"loss": 4.7469,
"step": 2
},
{
"epoch": 0.0009106085900743664,
"grad_norm": 7.488056182861328,
"learning_rate": 3e-05,
"loss": 5.0105,
"step": 3
},
{
"epoch": 0.0012141447867658219,
"grad_norm": 4.885837078094482,
"learning_rate": 4e-05,
"loss": 4.3945,
"step": 4
},
{
"epoch": 0.0015176809834572772,
"grad_norm": 3.793656587600708,
"learning_rate": 5e-05,
"loss": 4.0574,
"step": 5
},
{
"epoch": 0.0018212171801487327,
"grad_norm": 3.9249916076660156,
"learning_rate": 6e-05,
"loss": 3.8179,
"step": 6
},
{
"epoch": 0.002124753376840188,
"grad_norm": 3.4937145709991455,
"learning_rate": 7e-05,
"loss": 3.5297,
"step": 7
},
{
"epoch": 0.0024282895735316438,
"grad_norm": 2.499041795730591,
"learning_rate": 8e-05,
"loss": 3.15,
"step": 8
},
{
"epoch": 0.002731825770223099,
"grad_norm": 2.0781290531158447,
"learning_rate": 9e-05,
"loss": 2.8658,
"step": 9
},
{
"epoch": 0.0030353619669145544,
"grad_norm": 2.0124764442443848,
"learning_rate": 0.0001,
"loss": 2.6826,
"step": 10
},
{
"epoch": 0.00333889816360601,
"grad_norm": 1.4209256172180176,
"learning_rate": 9.99949377341298e-05,
"loss": 2.5608,
"step": 11
},
{
"epoch": 0.0036424343602974654,
"grad_norm": 3.176084041595459,
"learning_rate": 9.99898754682596e-05,
"loss": 2.2416,
"step": 12
},
{
"epoch": 0.003945970556988921,
"grad_norm": 1.4457614421844482,
"learning_rate": 9.998481320238939e-05,
"loss": 2.1925,
"step": 13
},
{
"epoch": 0.004249506753680376,
"grad_norm": 1.3989348411560059,
"learning_rate": 9.997975093651918e-05,
"loss": 2.2165,
"step": 14
},
{
"epoch": 0.004553042950371832,
"grad_norm": 1.0647027492523193,
"learning_rate": 9.997468867064899e-05,
"loss": 2.3486,
"step": 15
},
{
"epoch": 0.0048565791470632875,
"grad_norm": 1.0246940851211548,
"learning_rate": 9.996962640477879e-05,
"loss": 2.19,
"step": 16
},
{
"epoch": 0.005160115343754742,
"grad_norm": 1.029646873474121,
"learning_rate": 9.996456413890858e-05,
"loss": 2.4052,
"step": 17
},
{
"epoch": 0.005463651540446198,
"grad_norm": 1.322654128074646,
"learning_rate": 9.995950187303838e-05,
"loss": 2.1927,
"step": 18
},
{
"epoch": 0.005767187737137654,
"grad_norm": 2.061326026916504,
"learning_rate": 9.995443960716817e-05,
"loss": 2.4574,
"step": 19
},
{
"epoch": 0.006070723933829109,
"grad_norm": 1.1343607902526855,
"learning_rate": 9.994937734129797e-05,
"loss": 1.9598,
"step": 20
},
{
"epoch": 0.0063742601305205645,
"grad_norm": 1.13712477684021,
"learning_rate": 9.994431507542776e-05,
"loss": 2.8643,
"step": 21
},
{
"epoch": 0.00667779632721202,
"grad_norm": 0.8220421671867371,
"learning_rate": 9.993925280955756e-05,
"loss": 2.0474,
"step": 22
},
{
"epoch": 0.006981332523903475,
"grad_norm": 0.8233473300933838,
"learning_rate": 9.993419054368735e-05,
"loss": 2.3597,
"step": 23
},
{
"epoch": 0.007284868720594931,
"grad_norm": 0.8661925196647644,
"learning_rate": 9.992912827781716e-05,
"loss": 2.2163,
"step": 24
},
{
"epoch": 0.007588404917286387,
"grad_norm": 0.7995729446411133,
"learning_rate": 9.992406601194695e-05,
"loss": 1.8051,
"step": 25
},
{
"epoch": 0.007891941113977842,
"grad_norm": 0.810165286064148,
"learning_rate": 9.991900374607675e-05,
"loss": 1.9189,
"step": 26
},
{
"epoch": 0.008195477310669297,
"grad_norm": 0.8240752220153809,
"learning_rate": 9.991394148020654e-05,
"loss": 1.7,
"step": 27
},
{
"epoch": 0.008499013507360752,
"grad_norm": 1.0160635709762573,
"learning_rate": 9.990887921433634e-05,
"loss": 2.2964,
"step": 28
},
{
"epoch": 0.008802549704052209,
"grad_norm": 0.794966995716095,
"learning_rate": 9.990381694846613e-05,
"loss": 1.7333,
"step": 29
},
{
"epoch": 0.009106085900743664,
"grad_norm": 0.5594797134399414,
"learning_rate": 9.989875468259593e-05,
"loss": 2.0925,
"step": 30
},
{
"epoch": 0.009409622097435118,
"grad_norm": 0.8100740909576416,
"learning_rate": 9.989369241672572e-05,
"loss": 2.1218,
"step": 31
},
{
"epoch": 0.009713158294126575,
"grad_norm": 0.7057996392250061,
"learning_rate": 9.988863015085552e-05,
"loss": 2.005,
"step": 32
},
{
"epoch": 0.01001669449081803,
"grad_norm": 0.8970999121665955,
"learning_rate": 9.988356788498533e-05,
"loss": 2.2414,
"step": 33
},
{
"epoch": 0.010320230687509485,
"grad_norm": 0.6290627717971802,
"learning_rate": 9.987850561911512e-05,
"loss": 2.2422,
"step": 34
},
{
"epoch": 0.010623766884200941,
"grad_norm": 0.5665722489356995,
"learning_rate": 9.987344335324492e-05,
"loss": 1.9342,
"step": 35
},
{
"epoch": 0.010927303080892396,
"grad_norm": 0.5792561173439026,
"learning_rate": 9.986838108737472e-05,
"loss": 1.8733,
"step": 36
},
{
"epoch": 0.011230839277583851,
"grad_norm": 0.5264159440994263,
"learning_rate": 9.986331882150452e-05,
"loss": 2.1739,
"step": 37
},
{
"epoch": 0.011534375474275308,
"grad_norm": 0.5069584250450134,
"learning_rate": 9.985825655563431e-05,
"loss": 1.6235,
"step": 38
},
{
"epoch": 0.011837911670966763,
"grad_norm": 0.7689110636711121,
"learning_rate": 9.985319428976411e-05,
"loss": 1.711,
"step": 39
},
{
"epoch": 0.012141447867658217,
"grad_norm": 0.7001574635505676,
"learning_rate": 9.98481320238939e-05,
"loss": 1.651,
"step": 40
},
{
"epoch": 0.012444984064349674,
"grad_norm": 0.5615801811218262,
"learning_rate": 9.98430697580237e-05,
"loss": 2.128,
"step": 41
},
{
"epoch": 0.012748520261041129,
"grad_norm": 0.8766308426856995,
"learning_rate": 9.983800749215349e-05,
"loss": 2.4421,
"step": 42
},
{
"epoch": 0.013052056457732584,
"grad_norm": 0.704547107219696,
"learning_rate": 9.983294522628329e-05,
"loss": 1.6921,
"step": 43
},
{
"epoch": 0.01335559265442404,
"grad_norm": 0.5749143362045288,
"learning_rate": 9.982788296041308e-05,
"loss": 2.0173,
"step": 44
},
{
"epoch": 0.013659128851115495,
"grad_norm": 0.7929263710975647,
"learning_rate": 9.982282069454289e-05,
"loss": 2.1755,
"step": 45
},
{
"epoch": 0.01396266504780695,
"grad_norm": 1.6391934156417847,
"learning_rate": 9.981775842867269e-05,
"loss": 2.4995,
"step": 46
},
{
"epoch": 0.014266201244498407,
"grad_norm": 0.49616461992263794,
"learning_rate": 9.981269616280248e-05,
"loss": 2.3363,
"step": 47
},
{
"epoch": 0.014569737441189862,
"grad_norm": 0.614272952079773,
"learning_rate": 9.980763389693227e-05,
"loss": 2.0277,
"step": 48
},
{
"epoch": 0.014873273637881317,
"grad_norm": 0.6181132197380066,
"learning_rate": 9.980257163106207e-05,
"loss": 2.2867,
"step": 49
},
{
"epoch": 0.015176809834572773,
"grad_norm": 0.5342630743980408,
"learning_rate": 9.979750936519186e-05,
"loss": 1.7314,
"step": 50
},
{
"epoch": 0.015480346031264228,
"grad_norm": 0.4582519233226776,
"learning_rate": 9.979244709932166e-05,
"loss": 1.9893,
"step": 51
},
{
"epoch": 0.015783882227955685,
"grad_norm": 0.5448606014251709,
"learning_rate": 9.978738483345145e-05,
"loss": 2.3266,
"step": 52
},
{
"epoch": 0.01608741842464714,
"grad_norm": 1.0823545455932617,
"learning_rate": 9.978232256758125e-05,
"loss": 2.1919,
"step": 53
},
{
"epoch": 0.016390954621338594,
"grad_norm": 0.5506464838981628,
"learning_rate": 9.977726030171106e-05,
"loss": 2.0735,
"step": 54
},
{
"epoch": 0.01669449081803005,
"grad_norm": 0.568626344203949,
"learning_rate": 9.977219803584085e-05,
"loss": 2.051,
"step": 55
},
{
"epoch": 0.016998027014721504,
"grad_norm": 0.512907087802887,
"learning_rate": 9.976713576997065e-05,
"loss": 1.6473,
"step": 56
},
{
"epoch": 0.017301563211412962,
"grad_norm": 0.5541898012161255,
"learning_rate": 9.976207350410044e-05,
"loss": 1.8184,
"step": 57
},
{
"epoch": 0.017605099408104417,
"grad_norm": 0.5083638429641724,
"learning_rate": 9.975701123823024e-05,
"loss": 1.7573,
"step": 58
},
{
"epoch": 0.017908635604795872,
"grad_norm": 0.4722895920276642,
"learning_rate": 9.975194897236003e-05,
"loss": 2.0311,
"step": 59
},
{
"epoch": 0.018212171801487327,
"grad_norm": 0.5068002343177795,
"learning_rate": 9.974688670648983e-05,
"loss": 2.1245,
"step": 60
},
{
"epoch": 0.018515707998178782,
"grad_norm": 0.5726852416992188,
"learning_rate": 9.974182444061962e-05,
"loss": 2.1017,
"step": 61
},
{
"epoch": 0.018819244194870237,
"grad_norm": 0.5240160226821899,
"learning_rate": 9.973676217474942e-05,
"loss": 2.2665,
"step": 62
},
{
"epoch": 0.019122780391561695,
"grad_norm": 0.4728144705295563,
"learning_rate": 9.973169990887921e-05,
"loss": 2.0537,
"step": 63
},
{
"epoch": 0.01942631658825315,
"grad_norm": 0.47115418314933777,
"learning_rate": 9.972663764300902e-05,
"loss": 1.2815,
"step": 64
},
{
"epoch": 0.019729852784944605,
"grad_norm": 0.7070208191871643,
"learning_rate": 9.972157537713881e-05,
"loss": 1.8514,
"step": 65
},
{
"epoch": 0.02003338898163606,
"grad_norm": 0.529069185256958,
"learning_rate": 9.971651311126861e-05,
"loss": 1.7602,
"step": 66
},
{
"epoch": 0.020336925178327515,
"grad_norm": 0.7532087564468384,
"learning_rate": 9.97114508453984e-05,
"loss": 2.2168,
"step": 67
},
{
"epoch": 0.02064046137501897,
"grad_norm": 0.5654622912406921,
"learning_rate": 9.97063885795282e-05,
"loss": 1.9634,
"step": 68
},
{
"epoch": 0.020943997571710428,
"grad_norm": 0.701452910900116,
"learning_rate": 9.970132631365799e-05,
"loss": 2.044,
"step": 69
},
{
"epoch": 0.021247533768401883,
"grad_norm": 0.5750812888145447,
"learning_rate": 9.969626404778779e-05,
"loss": 1.8015,
"step": 70
},
{
"epoch": 0.021551069965093338,
"grad_norm": 0.49930402636528015,
"learning_rate": 9.969120178191758e-05,
"loss": 1.7998,
"step": 71
},
{
"epoch": 0.021854606161784793,
"grad_norm": 0.4348014295101166,
"learning_rate": 9.968613951604738e-05,
"loss": 1.9959,
"step": 72
},
{
"epoch": 0.022158142358476247,
"grad_norm": 0.5268503427505493,
"learning_rate": 9.968107725017719e-05,
"loss": 1.8497,
"step": 73
},
{
"epoch": 0.022461678555167702,
"grad_norm": 0.578822135925293,
"learning_rate": 9.967601498430698e-05,
"loss": 2.3277,
"step": 74
},
{
"epoch": 0.02276521475185916,
"grad_norm": 0.52215975522995,
"learning_rate": 9.967095271843677e-05,
"loss": 2.1179,
"step": 75
},
{
"epoch": 0.023068750948550616,
"grad_norm": 0.4557477533817291,
"learning_rate": 9.966589045256657e-05,
"loss": 2.0132,
"step": 76
},
{
"epoch": 0.02337228714524207,
"grad_norm": 0.5032123327255249,
"learning_rate": 9.966082818669638e-05,
"loss": 1.8608,
"step": 77
},
{
"epoch": 0.023675823341933525,
"grad_norm": 0.42689865827560425,
"learning_rate": 9.965576592082617e-05,
"loss": 2.0437,
"step": 78
},
{
"epoch": 0.02397935953862498,
"grad_norm": 0.44310206174850464,
"learning_rate": 9.965070365495597e-05,
"loss": 2.1222,
"step": 79
},
{
"epoch": 0.024282895735316435,
"grad_norm": 0.4377008378505707,
"learning_rate": 9.964564138908576e-05,
"loss": 2.0418,
"step": 80
},
{
"epoch": 0.024586431932007893,
"grad_norm": 0.35174912214279175,
"learning_rate": 9.964057912321556e-05,
"loss": 1.6931,
"step": 81
},
{
"epoch": 0.024889968128699348,
"grad_norm": 0.47877687215805054,
"learning_rate": 9.963551685734535e-05,
"loss": 1.7049,
"step": 82
},
{
"epoch": 0.025193504325390803,
"grad_norm": 0.4063829183578491,
"learning_rate": 9.963045459147515e-05,
"loss": 1.8611,
"step": 83
},
{
"epoch": 0.025497040522082258,
"grad_norm": 0.4149170219898224,
"learning_rate": 9.962539232560496e-05,
"loss": 1.9439,
"step": 84
},
{
"epoch": 0.025800576718773713,
"grad_norm": 0.4882602393627167,
"learning_rate": 9.962033005973475e-05,
"loss": 1.5723,
"step": 85
},
{
"epoch": 0.026104112915465168,
"grad_norm": 0.4600992202758789,
"learning_rate": 9.961526779386454e-05,
"loss": 2.0142,
"step": 86
},
{
"epoch": 0.026407649112156626,
"grad_norm": 0.43366697430610657,
"learning_rate": 9.961020552799434e-05,
"loss": 1.9175,
"step": 87
},
{
"epoch": 0.02671118530884808,
"grad_norm": 0.501487135887146,
"learning_rate": 9.960514326212413e-05,
"loss": 1.5043,
"step": 88
},
{
"epoch": 0.027014721505539536,
"grad_norm": 0.43821993470191956,
"learning_rate": 9.960008099625393e-05,
"loss": 1.8622,
"step": 89
},
{
"epoch": 0.02731825770223099,
"grad_norm": 0.4433805048465729,
"learning_rate": 9.959501873038372e-05,
"loss": 1.9459,
"step": 90
},
{
"epoch": 0.027621793898922446,
"grad_norm": 0.4686216115951538,
"learning_rate": 9.958995646451352e-05,
"loss": 1.7405,
"step": 91
},
{
"epoch": 0.0279253300956139,
"grad_norm": 0.48586198687553406,
"learning_rate": 9.958489419864331e-05,
"loss": 2.2233,
"step": 92
},
{
"epoch": 0.02822886629230536,
"grad_norm": 0.4018734097480774,
"learning_rate": 9.957983193277312e-05,
"loss": 2.0027,
"step": 93
},
{
"epoch": 0.028532402488996814,
"grad_norm": 0.4996435344219208,
"learning_rate": 9.957476966690292e-05,
"loss": 1.5949,
"step": 94
},
{
"epoch": 0.02883593868568827,
"grad_norm": 0.45447826385498047,
"learning_rate": 9.956970740103271e-05,
"loss": 1.7636,
"step": 95
},
{
"epoch": 0.029139474882379723,
"grad_norm": 0.4209904372692108,
"learning_rate": 9.95646451351625e-05,
"loss": 1.7523,
"step": 96
},
{
"epoch": 0.029443011079071178,
"grad_norm": 0.3740164637565613,
"learning_rate": 9.95595828692923e-05,
"loss": 1.9136,
"step": 97
},
{
"epoch": 0.029746547275762633,
"grad_norm": 0.4169963598251343,
"learning_rate": 9.95545206034221e-05,
"loss": 1.9136,
"step": 98
},
{
"epoch": 0.03005008347245409,
"grad_norm": 0.4683006703853607,
"learning_rate": 9.954945833755189e-05,
"loss": 2.0657,
"step": 99
},
{
"epoch": 0.030353619669145546,
"grad_norm": 0.4508633017539978,
"learning_rate": 9.954439607168169e-05,
"loss": 2.1099,
"step": 100
},
{
"epoch": 0.030657155865837,
"grad_norm": 0.4136218726634979,
"learning_rate": 9.953933380581148e-05,
"loss": 2.0183,
"step": 101
},
{
"epoch": 0.030960692062528456,
"grad_norm": 0.44510790705680847,
"learning_rate": 9.953427153994127e-05,
"loss": 1.9307,
"step": 102
},
{
"epoch": 0.031264228259219914,
"grad_norm": 0.3713892698287964,
"learning_rate": 9.952920927407108e-05,
"loss": 1.7017,
"step": 103
},
{
"epoch": 0.03156776445591137,
"grad_norm": 0.47902294993400574,
"learning_rate": 9.952414700820088e-05,
"loss": 2.1172,
"step": 104
},
{
"epoch": 0.031871300652602824,
"grad_norm": 0.4492317736148834,
"learning_rate": 9.951908474233067e-05,
"loss": 1.9752,
"step": 105
},
{
"epoch": 0.03217483684929428,
"grad_norm": 0.4096255302429199,
"learning_rate": 9.951402247646047e-05,
"loss": 1.5511,
"step": 106
},
{
"epoch": 0.032478373045985734,
"grad_norm": 0.39630818367004395,
"learning_rate": 9.950896021059026e-05,
"loss": 2.11,
"step": 107
},
{
"epoch": 0.03278190924267719,
"grad_norm": 0.42648032307624817,
"learning_rate": 9.950389794472006e-05,
"loss": 2.1784,
"step": 108
},
{
"epoch": 0.033085445439368644,
"grad_norm": 0.4814178943634033,
"learning_rate": 9.949883567884985e-05,
"loss": 1.955,
"step": 109
},
{
"epoch": 0.0333889816360601,
"grad_norm": 0.41600191593170166,
"learning_rate": 9.949377341297965e-05,
"loss": 1.9163,
"step": 110
},
{
"epoch": 0.03369251783275155,
"grad_norm": 0.4610773026943207,
"learning_rate": 9.948871114710944e-05,
"loss": 1.7934,
"step": 111
},
{
"epoch": 0.03399605402944301,
"grad_norm": 0.43061718344688416,
"learning_rate": 9.948364888123925e-05,
"loss": 1.9278,
"step": 112
},
{
"epoch": 0.03429959022613446,
"grad_norm": 0.3907497227191925,
"learning_rate": 9.947858661536904e-05,
"loss": 1.996,
"step": 113
},
{
"epoch": 0.034603126422825925,
"grad_norm": 0.3984166383743286,
"learning_rate": 9.947352434949884e-05,
"loss": 1.5936,
"step": 114
},
{
"epoch": 0.03490666261951738,
"grad_norm": 0.43406423926353455,
"learning_rate": 9.946846208362863e-05,
"loss": 1.8866,
"step": 115
},
{
"epoch": 0.035210198816208835,
"grad_norm": 0.45913639664649963,
"learning_rate": 9.946339981775843e-05,
"loss": 1.972,
"step": 116
},
{
"epoch": 0.03551373501290029,
"grad_norm": 0.42077311873435974,
"learning_rate": 9.945833755188822e-05,
"loss": 2.0081,
"step": 117
},
{
"epoch": 0.035817271209591744,
"grad_norm": 0.41479435563087463,
"learning_rate": 9.945327528601802e-05,
"loss": 2.0096,
"step": 118
},
{
"epoch": 0.0361208074062832,
"grad_norm": 0.35669025778770447,
"learning_rate": 9.944821302014781e-05,
"loss": 2.0074,
"step": 119
},
{
"epoch": 0.036424343602974654,
"grad_norm": 0.4088069796562195,
"learning_rate": 9.944315075427761e-05,
"loss": 1.817,
"step": 120
},
{
"epoch": 0.03672787979966611,
"grad_norm": 0.49982163310050964,
"learning_rate": 9.943808848840742e-05,
"loss": 1.9218,
"step": 121
},
{
"epoch": 0.037031415996357564,
"grad_norm": 0.39924055337905884,
"learning_rate": 9.943302622253721e-05,
"loss": 2.2463,
"step": 122
},
{
"epoch": 0.03733495219304902,
"grad_norm": 0.40462177991867065,
"learning_rate": 9.942796395666702e-05,
"loss": 2.0844,
"step": 123
},
{
"epoch": 0.037638488389740474,
"grad_norm": 0.43440741300582886,
"learning_rate": 9.942290169079681e-05,
"loss": 1.8808,
"step": 124
},
{
"epoch": 0.03794202458643193,
"grad_norm": 0.4029730260372162,
"learning_rate": 9.941783942492661e-05,
"loss": 1.9427,
"step": 125
},
{
"epoch": 0.03824556078312339,
"grad_norm": 0.7807103395462036,
"learning_rate": 9.94127771590564e-05,
"loss": 1.9072,
"step": 126
},
{
"epoch": 0.038549096979814845,
"grad_norm": 0.5021561980247498,
"learning_rate": 9.94077148931862e-05,
"loss": 2.0582,
"step": 127
},
{
"epoch": 0.0388526331765063,
"grad_norm": 0.5161197781562805,
"learning_rate": 9.9402652627316e-05,
"loss": 1.9861,
"step": 128
},
{
"epoch": 0.039156169373197755,
"grad_norm": 0.5553935766220093,
"learning_rate": 9.939759036144579e-05,
"loss": 2.1893,
"step": 129
},
{
"epoch": 0.03945970556988921,
"grad_norm": 0.4241655170917511,
"learning_rate": 9.939252809557558e-05,
"loss": 1.9722,
"step": 130
},
{
"epoch": 0.039763241766580665,
"grad_norm": 0.43290001153945923,
"learning_rate": 9.938746582970538e-05,
"loss": 1.5364,
"step": 131
},
{
"epoch": 0.04006677796327212,
"grad_norm": 0.40089091658592224,
"learning_rate": 9.938240356383519e-05,
"loss": 1.9686,
"step": 132
},
{
"epoch": 0.040370314159963575,
"grad_norm": 0.4152032434940338,
"learning_rate": 9.937734129796498e-05,
"loss": 1.913,
"step": 133
},
{
"epoch": 0.04067385035665503,
"grad_norm": 0.4443211555480957,
"learning_rate": 9.937227903209478e-05,
"loss": 2.2354,
"step": 134
},
{
"epoch": 0.040977386553346484,
"grad_norm": 0.41355323791503906,
"learning_rate": 9.936721676622457e-05,
"loss": 2.1055,
"step": 135
},
{
"epoch": 0.04128092275003794,
"grad_norm": 0.5837479829788208,
"learning_rate": 9.936215450035437e-05,
"loss": 1.9085,
"step": 136
},
{
"epoch": 0.041584458946729394,
"grad_norm": 0.40269389748573303,
"learning_rate": 9.935709223448416e-05,
"loss": 2.0368,
"step": 137
},
{
"epoch": 0.041887995143420856,
"grad_norm": 0.5898969769477844,
"learning_rate": 9.935202996861396e-05,
"loss": 1.7933,
"step": 138
},
{
"epoch": 0.04219153134011231,
"grad_norm": 0.41117680072784424,
"learning_rate": 9.934696770274375e-05,
"loss": 1.7452,
"step": 139
},
{
"epoch": 0.042495067536803766,
"grad_norm": 0.5090368390083313,
"learning_rate": 9.934190543687354e-05,
"loss": 2.0141,
"step": 140
},
{
"epoch": 0.04279860373349522,
"grad_norm": 0.4821307957172394,
"learning_rate": 9.933684317100334e-05,
"loss": 1.9443,
"step": 141
},
{
"epoch": 0.043102139930186675,
"grad_norm": 0.41939428448677063,
"learning_rate": 9.933178090513315e-05,
"loss": 1.7401,
"step": 142
},
{
"epoch": 0.04340567612687813,
"grad_norm": 0.4531096816062927,
"learning_rate": 9.932671863926294e-05,
"loss": 1.9944,
"step": 143
},
{
"epoch": 0.043709212323569585,
"grad_norm": 0.44440799951553345,
"learning_rate": 9.932165637339274e-05,
"loss": 1.9648,
"step": 144
},
{
"epoch": 0.04401274852026104,
"grad_norm": 0.36847150325775146,
"learning_rate": 9.931659410752253e-05,
"loss": 2.0638,
"step": 145
},
{
"epoch": 0.044316284716952495,
"grad_norm": 0.6394171118736267,
"learning_rate": 9.931153184165233e-05,
"loss": 1.9476,
"step": 146
},
{
"epoch": 0.04461982091364395,
"grad_norm": 0.41597506403923035,
"learning_rate": 9.930646957578212e-05,
"loss": 1.535,
"step": 147
},
{
"epoch": 0.044923357110335405,
"grad_norm": 0.5597077012062073,
"learning_rate": 9.930140730991192e-05,
"loss": 1.6826,
"step": 148
},
{
"epoch": 0.045226893307026866,
"grad_norm": 0.5532084703445435,
"learning_rate": 9.929634504404171e-05,
"loss": 1.8063,
"step": 149
},
{
"epoch": 0.04553042950371832,
"grad_norm": 0.467339426279068,
"learning_rate": 9.92912827781715e-05,
"loss": 2.017,
"step": 150
},
{
"epoch": 0.045833965700409776,
"grad_norm": 0.4054040312767029,
"learning_rate": 9.928622051230131e-05,
"loss": 1.7582,
"step": 151
},
{
"epoch": 0.04613750189710123,
"grad_norm": 1.2743823528289795,
"learning_rate": 9.928115824643111e-05,
"loss": 2.0202,
"step": 152
},
{
"epoch": 0.046441038093792686,
"grad_norm": 0.4357397258281708,
"learning_rate": 9.92760959805609e-05,
"loss": 1.8788,
"step": 153
},
{
"epoch": 0.04674457429048414,
"grad_norm": 2.8793208599090576,
"learning_rate": 9.92710337146907e-05,
"loss": 2.1204,
"step": 154
},
{
"epoch": 0.047048110487175596,
"grad_norm": 0.9585952162742615,
"learning_rate": 9.92659714488205e-05,
"loss": 1.9356,
"step": 155
},
{
"epoch": 0.04735164668386705,
"grad_norm": 0.7857603430747986,
"learning_rate": 9.926090918295029e-05,
"loss": 1.9097,
"step": 156
},
{
"epoch": 0.047655182880558505,
"grad_norm": 0.5259221792221069,
"learning_rate": 9.925584691708008e-05,
"loss": 2.1589,
"step": 157
},
{
"epoch": 0.04795871907724996,
"grad_norm": 2.793253183364868,
"learning_rate": 9.925078465120988e-05,
"loss": 1.7202,
"step": 158
},
{
"epoch": 0.048262255273941415,
"grad_norm": 0.4432888627052307,
"learning_rate": 9.924572238533967e-05,
"loss": 1.9898,
"step": 159
},
{
"epoch": 0.04856579147063287,
"grad_norm": 0.4347291588783264,
"learning_rate": 9.924066011946948e-05,
"loss": 1.8142,
"step": 160
},
{
"epoch": 0.04886932766732433,
"grad_norm": 5.273514747619629,
"learning_rate": 9.923559785359928e-05,
"loss": 1.8665,
"step": 161
},
{
"epoch": 0.04917286386401579,
"grad_norm": 0.47988301515579224,
"learning_rate": 9.923053558772907e-05,
"loss": 1.9439,
"step": 162
},
{
"epoch": 0.04947640006070724,
"grad_norm": 0.3584117293357849,
"learning_rate": 9.922547332185887e-05,
"loss": 1.8109,
"step": 163
},
{
"epoch": 0.049779936257398696,
"grad_norm": 0.4074074923992157,
"learning_rate": 9.922041105598866e-05,
"loss": 2.1056,
"step": 164
},
{
"epoch": 0.05008347245409015,
"grad_norm": 3.159336566925049,
"learning_rate": 9.921534879011846e-05,
"loss": 1.8672,
"step": 165
},
{
"epoch": 0.050387008650781606,
"grad_norm": 0.38132309913635254,
"learning_rate": 9.921028652424826e-05,
"loss": 1.8423,
"step": 166
},
{
"epoch": 0.05069054484747306,
"grad_norm": 0.39241936802864075,
"learning_rate": 9.920522425837806e-05,
"loss": 1.5949,
"step": 167
},
{
"epoch": 0.050994081044164516,
"grad_norm": 0.38212037086486816,
"learning_rate": 9.920016199250785e-05,
"loss": 1.9669,
"step": 168
},
{
"epoch": 0.05129761724085597,
"grad_norm": 0.5353955030441284,
"learning_rate": 9.919509972663765e-05,
"loss": 2.1806,
"step": 169
},
{
"epoch": 0.051601153437547426,
"grad_norm": 0.4129483699798584,
"learning_rate": 9.919003746076744e-05,
"loss": 1.8858,
"step": 170
},
{
"epoch": 0.05190468963423888,
"grad_norm": 0.3832380771636963,
"learning_rate": 9.918497519489725e-05,
"loss": 2.0321,
"step": 171
},
{
"epoch": 0.052208225830930335,
"grad_norm": 0.4078863859176636,
"learning_rate": 9.917991292902705e-05,
"loss": 1.6213,
"step": 172
},
{
"epoch": 0.0525117620276218,
"grad_norm": 0.38865014910697937,
"learning_rate": 9.917485066315684e-05,
"loss": 2.0052,
"step": 173
},
{
"epoch": 0.05281529822431325,
"grad_norm": 0.4339440166950226,
"learning_rate": 9.916978839728664e-05,
"loss": 2.2405,
"step": 174
},
{
"epoch": 0.05311883442100471,
"grad_norm": 0.42063045501708984,
"learning_rate": 9.916472613141643e-05,
"loss": 1.6529,
"step": 175
},
{
"epoch": 0.05342237061769616,
"grad_norm": 0.4765849709510803,
"learning_rate": 9.915966386554623e-05,
"loss": 1.9645,
"step": 176
},
{
"epoch": 0.05372590681438762,
"grad_norm": 0.41431936621665955,
"learning_rate": 9.915460159967602e-05,
"loss": 1.9709,
"step": 177
},
{
"epoch": 0.05402944301107907,
"grad_norm": 0.3591434359550476,
"learning_rate": 9.914953933380581e-05,
"loss": 1.685,
"step": 178
},
{
"epoch": 0.054332979207770526,
"grad_norm": 0.45483240485191345,
"learning_rate": 9.914447706793561e-05,
"loss": 1.9362,
"step": 179
},
{
"epoch": 0.05463651540446198,
"grad_norm": 0.5468000173568726,
"learning_rate": 9.91394148020654e-05,
"loss": 1.6984,
"step": 180
},
{
"epoch": 0.054940051601153436,
"grad_norm": 0.4057190716266632,
"learning_rate": 9.913435253619521e-05,
"loss": 1.9887,
"step": 181
},
{
"epoch": 0.05524358779784489,
"grad_norm": 0.383211612701416,
"learning_rate": 9.912929027032501e-05,
"loss": 1.7825,
"step": 182
},
{
"epoch": 0.055547123994536346,
"grad_norm": 0.3480004668235779,
"learning_rate": 9.91242280044548e-05,
"loss": 1.8721,
"step": 183
},
{
"epoch": 0.0558506601912278,
"grad_norm": 0.47680413722991943,
"learning_rate": 9.91191657385846e-05,
"loss": 1.8113,
"step": 184
},
{
"epoch": 0.05615419638791926,
"grad_norm": 0.37727096676826477,
"learning_rate": 9.911410347271439e-05,
"loss": 1.7398,
"step": 185
},
{
"epoch": 0.05645773258461072,
"grad_norm": 0.47738176584243774,
"learning_rate": 9.910904120684419e-05,
"loss": 1.4651,
"step": 186
},
{
"epoch": 0.05676126878130217,
"grad_norm": 0.44533729553222656,
"learning_rate": 9.910397894097398e-05,
"loss": 1.5697,
"step": 187
},
{
"epoch": 0.05706480497799363,
"grad_norm": 0.45051974058151245,
"learning_rate": 9.909891667510378e-05,
"loss": 2.1577,
"step": 188
},
{
"epoch": 0.05736834117468508,
"grad_norm": 0.4709470272064209,
"learning_rate": 9.909385440923357e-05,
"loss": 2.0486,
"step": 189
},
{
"epoch": 0.05767187737137654,
"grad_norm": 0.4063846170902252,
"learning_rate": 9.908879214336338e-05,
"loss": 1.5453,
"step": 190
},
{
"epoch": 0.05797541356806799,
"grad_norm": 0.374362587928772,
"learning_rate": 9.908372987749317e-05,
"loss": 1.5611,
"step": 191
},
{
"epoch": 0.05827894976475945,
"grad_norm": 0.4852111041545868,
"learning_rate": 9.907866761162297e-05,
"loss": 1.6234,
"step": 192
},
{
"epoch": 0.0585824859614509,
"grad_norm": 0.6863122582435608,
"learning_rate": 9.907360534575276e-05,
"loss": 2.1612,
"step": 193
},
{
"epoch": 0.058886022158142357,
"grad_norm": 0.6040588021278381,
"learning_rate": 9.906854307988256e-05,
"loss": 2.1092,
"step": 194
},
{
"epoch": 0.05918955835483381,
"grad_norm": 0.4148467779159546,
"learning_rate": 9.906348081401235e-05,
"loss": 2.1108,
"step": 195
},
{
"epoch": 0.059493094551525266,
"grad_norm": 0.36098209023475647,
"learning_rate": 9.905841854814215e-05,
"loss": 2.0002,
"step": 196
},
{
"epoch": 0.05979663074821673,
"grad_norm": 0.42360183596611023,
"learning_rate": 9.905335628227194e-05,
"loss": 2.3124,
"step": 197
},
{
"epoch": 0.06010016694490818,
"grad_norm": 0.3650914430618286,
"learning_rate": 9.904829401640174e-05,
"loss": 1.8778,
"step": 198
},
{
"epoch": 0.06040370314159964,
"grad_norm": 0.392995148897171,
"learning_rate": 9.904323175053155e-05,
"loss": 2.16,
"step": 199
},
{
"epoch": 0.06070723933829109,
"grad_norm": 0.46390387415885925,
"learning_rate": 9.903816948466134e-05,
"loss": 1.8695,
"step": 200
},
{
"epoch": 0.06101077553498255,
"grad_norm": 0.3954870402812958,
"learning_rate": 9.903310721879114e-05,
"loss": 1.9233,
"step": 201
},
{
"epoch": 0.061314311731674,
"grad_norm": 0.3650193214416504,
"learning_rate": 9.902804495292093e-05,
"loss": 2.2504,
"step": 202
},
{
"epoch": 0.06161784792836546,
"grad_norm": 0.3582104742527008,
"learning_rate": 9.902298268705073e-05,
"loss": 1.9303,
"step": 203
},
{
"epoch": 0.06192138412505691,
"grad_norm": 0.35688868165016174,
"learning_rate": 9.901792042118052e-05,
"loss": 1.7078,
"step": 204
},
{
"epoch": 0.06222492032174837,
"grad_norm": 0.3666802942752838,
"learning_rate": 9.901285815531031e-05,
"loss": 1.941,
"step": 205
},
{
"epoch": 0.06252845651843983,
"grad_norm": 0.42375093698501587,
"learning_rate": 9.900779588944011e-05,
"loss": 2.0858,
"step": 206
},
{
"epoch": 0.06283199271513128,
"grad_norm": 0.3913770318031311,
"learning_rate": 9.90027336235699e-05,
"loss": 2.1423,
"step": 207
},
{
"epoch": 0.06313552891182274,
"grad_norm": 0.4101809859275818,
"learning_rate": 9.89976713576997e-05,
"loss": 2.0497,
"step": 208
},
{
"epoch": 0.06343906510851419,
"grad_norm": 0.3696439564228058,
"learning_rate": 9.899260909182951e-05,
"loss": 1.9692,
"step": 209
},
{
"epoch": 0.06374260130520565,
"grad_norm": 0.3725574016571045,
"learning_rate": 9.89875468259593e-05,
"loss": 2.2053,
"step": 210
},
{
"epoch": 0.0640461375018971,
"grad_norm": 0.4886903166770935,
"learning_rate": 9.898248456008911e-05,
"loss": 1.8981,
"step": 211
},
{
"epoch": 0.06434967369858856,
"grad_norm": 0.4423249661922455,
"learning_rate": 9.89774222942189e-05,
"loss": 1.9058,
"step": 212
},
{
"epoch": 0.06465320989528,
"grad_norm": 0.4045765697956085,
"learning_rate": 9.89723600283487e-05,
"loss": 1.8056,
"step": 213
},
{
"epoch": 0.06495674609197147,
"grad_norm": 0.43866047263145447,
"learning_rate": 9.89672977624785e-05,
"loss": 1.6315,
"step": 214
},
{
"epoch": 0.06526028228866293,
"grad_norm": 0.524714469909668,
"learning_rate": 9.896223549660829e-05,
"loss": 2.0156,
"step": 215
},
{
"epoch": 0.06556381848535438,
"grad_norm": 0.3752996325492859,
"learning_rate": 9.895717323073808e-05,
"loss": 2.2768,
"step": 216
},
{
"epoch": 0.06586735468204584,
"grad_norm": 0.4371670186519623,
"learning_rate": 9.895211096486788e-05,
"loss": 2.0755,
"step": 217
},
{
"epoch": 0.06617089087873729,
"grad_norm": 0.3751063644886017,
"learning_rate": 9.894704869899767e-05,
"loss": 2.2451,
"step": 218
},
{
"epoch": 0.06647442707542875,
"grad_norm": 0.6649600267410278,
"learning_rate": 9.894198643312747e-05,
"loss": 1.9835,
"step": 219
},
{
"epoch": 0.0667779632721202,
"grad_norm": 0.3941735625267029,
"learning_rate": 9.893692416725728e-05,
"loss": 2.0203,
"step": 220
},
{
"epoch": 0.06708149946881166,
"grad_norm": 0.41888293623924255,
"learning_rate": 9.893186190138707e-05,
"loss": 1.7572,
"step": 221
},
{
"epoch": 0.0673850356655031,
"grad_norm": 0.4820149838924408,
"learning_rate": 9.892679963551687e-05,
"loss": 2.0591,
"step": 222
},
{
"epoch": 0.06768857186219457,
"grad_norm": 0.3516736626625061,
"learning_rate": 9.892173736964666e-05,
"loss": 1.9398,
"step": 223
},
{
"epoch": 0.06799210805888602,
"grad_norm": 0.3873218894004822,
"learning_rate": 9.891667510377646e-05,
"loss": 1.6389,
"step": 224
},
{
"epoch": 0.06829564425557748,
"grad_norm": 0.3793487846851349,
"learning_rate": 9.891161283790625e-05,
"loss": 2.0075,
"step": 225
},
{
"epoch": 0.06859918045226893,
"grad_norm": 0.38987675309181213,
"learning_rate": 9.890655057203605e-05,
"loss": 2.0903,
"step": 226
},
{
"epoch": 0.06890271664896039,
"grad_norm": 0.4293549358844757,
"learning_rate": 9.890148830616584e-05,
"loss": 2.2099,
"step": 227
},
{
"epoch": 0.06920625284565185,
"grad_norm": 0.39895692467689514,
"learning_rate": 9.889642604029564e-05,
"loss": 1.8615,
"step": 228
},
{
"epoch": 0.0695097890423433,
"grad_norm": 0.4543936252593994,
"learning_rate": 9.889136377442544e-05,
"loss": 2.0828,
"step": 229
},
{
"epoch": 0.06981332523903476,
"grad_norm": 0.448477566242218,
"learning_rate": 9.888630150855524e-05,
"loss": 1.5524,
"step": 230
},
{
"epoch": 0.07011686143572621,
"grad_norm": 0.428975373506546,
"learning_rate": 9.888123924268503e-05,
"loss": 1.3828,
"step": 231
},
{
"epoch": 0.07042039763241767,
"grad_norm": 0.42287349700927734,
"learning_rate": 9.887617697681483e-05,
"loss": 2.096,
"step": 232
},
{
"epoch": 0.07072393382910912,
"grad_norm": 0.43614649772644043,
"learning_rate": 9.887111471094462e-05,
"loss": 1.8238,
"step": 233
},
{
"epoch": 0.07102747002580058,
"grad_norm": 0.47309553623199463,
"learning_rate": 9.886605244507442e-05,
"loss": 2.3526,
"step": 234
},
{
"epoch": 0.07133100622249203,
"grad_norm": 0.9558483362197876,
"learning_rate": 9.886099017920421e-05,
"loss": 1.9816,
"step": 235
},
{
"epoch": 0.07163454241918349,
"grad_norm": 0.3529858887195587,
"learning_rate": 9.885592791333401e-05,
"loss": 2.0314,
"step": 236
},
{
"epoch": 0.07193807861587494,
"grad_norm": 0.37652599811553955,
"learning_rate": 9.88508656474638e-05,
"loss": 1.9381,
"step": 237
},
{
"epoch": 0.0722416148125664,
"grad_norm": 0.40783143043518066,
"learning_rate": 9.884580338159361e-05,
"loss": 1.966,
"step": 238
},
{
"epoch": 0.07254515100925786,
"grad_norm": 0.4160328805446625,
"learning_rate": 9.88407411157234e-05,
"loss": 1.8176,
"step": 239
},
{
"epoch": 0.07284868720594931,
"grad_norm": 0.4397304952144623,
"learning_rate": 9.88356788498532e-05,
"loss": 1.6766,
"step": 240
},
{
"epoch": 0.07315222340264077,
"grad_norm": 0.42549702525138855,
"learning_rate": 9.8830616583983e-05,
"loss": 2.1176,
"step": 241
},
{
"epoch": 0.07345575959933222,
"grad_norm": 0.3747939169406891,
"learning_rate": 9.882555431811279e-05,
"loss": 1.5494,
"step": 242
},
{
"epoch": 0.07375929579602368,
"grad_norm": 3.4551990032196045,
"learning_rate": 9.882049205224258e-05,
"loss": 2.0336,
"step": 243
},
{
"epoch": 0.07406283199271513,
"grad_norm": 1.5632964372634888,
"learning_rate": 9.881542978637238e-05,
"loss": 1.7452,
"step": 244
},
{
"epoch": 0.07436636818940659,
"grad_norm": 0.41575855016708374,
"learning_rate": 9.881036752050217e-05,
"loss": 2.0243,
"step": 245
},
{
"epoch": 0.07466990438609804,
"grad_norm": 0.44168713688850403,
"learning_rate": 9.880530525463197e-05,
"loss": 2.0022,
"step": 246
},
{
"epoch": 0.0749734405827895,
"grad_norm": 0.46640321612358093,
"learning_rate": 9.880024298876176e-05,
"loss": 1.555,
"step": 247
},
{
"epoch": 0.07527697677948095,
"grad_norm": 0.3622835576534271,
"learning_rate": 9.879518072289157e-05,
"loss": 1.876,
"step": 248
},
{
"epoch": 0.07558051297617241,
"grad_norm": 0.6277987957000732,
"learning_rate": 9.879011845702137e-05,
"loss": 2.2753,
"step": 249
},
{
"epoch": 0.07588404917286386,
"grad_norm": 0.40246644616127014,
"learning_rate": 9.878505619115116e-05,
"loss": 1.5991,
"step": 250
},
{
"epoch": 0.07618758536955532,
"grad_norm": 0.38388529419898987,
"learning_rate": 9.877999392528096e-05,
"loss": 1.9226,
"step": 251
},
{
"epoch": 0.07649112156624678,
"grad_norm": 0.39985090494155884,
"learning_rate": 9.877493165941075e-05,
"loss": 2.0722,
"step": 252
},
{
"epoch": 0.07679465776293823,
"grad_norm": 0.3872128427028656,
"learning_rate": 9.876986939354055e-05,
"loss": 1.9132,
"step": 253
},
{
"epoch": 0.07709819395962969,
"grad_norm": 0.3665171265602112,
"learning_rate": 9.876480712767034e-05,
"loss": 1.6244,
"step": 254
},
{
"epoch": 0.07740173015632114,
"grad_norm": 0.4011310040950775,
"learning_rate": 9.875974486180015e-05,
"loss": 2.1289,
"step": 255
},
{
"epoch": 0.0777052663530126,
"grad_norm": 0.35013166069984436,
"learning_rate": 9.875468259592994e-05,
"loss": 1.9738,
"step": 256
},
{
"epoch": 0.07800880254970405,
"grad_norm": 0.48468607664108276,
"learning_rate": 9.874962033005974e-05,
"loss": 2.1368,
"step": 257
},
{
"epoch": 0.07831233874639551,
"grad_norm": 0.5015551447868347,
"learning_rate": 9.874455806418953e-05,
"loss": 2.1218,
"step": 258
},
{
"epoch": 0.07861587494308696,
"grad_norm": 0.41915133595466614,
"learning_rate": 9.873949579831934e-05,
"loss": 2.0052,
"step": 259
},
{
"epoch": 0.07891941113977842,
"grad_norm": 0.4414760172367096,
"learning_rate": 9.873443353244914e-05,
"loss": 1.7249,
"step": 260
},
{
"epoch": 0.07922294733646987,
"grad_norm": 0.47259169816970825,
"learning_rate": 9.872937126657893e-05,
"loss": 2.1041,
"step": 261
},
{
"epoch": 0.07952648353316133,
"grad_norm": 0.3689124882221222,
"learning_rate": 9.872430900070873e-05,
"loss": 1.8956,
"step": 262
},
{
"epoch": 0.07983001972985279,
"grad_norm": 0.3948320150375366,
"learning_rate": 9.871924673483852e-05,
"loss": 1.9211,
"step": 263
},
{
"epoch": 0.08013355592654424,
"grad_norm": 0.4235248267650604,
"learning_rate": 9.871418446896832e-05,
"loss": 1.7115,
"step": 264
},
{
"epoch": 0.0804370921232357,
"grad_norm": 0.48399198055267334,
"learning_rate": 9.870912220309811e-05,
"loss": 1.77,
"step": 265
},
{
"epoch": 0.08074062831992715,
"grad_norm": 0.34047526121139526,
"learning_rate": 9.87040599372279e-05,
"loss": 1.7189,
"step": 266
},
{
"epoch": 0.08104416451661861,
"grad_norm": 0.47203269600868225,
"learning_rate": 9.86989976713577e-05,
"loss": 1.7674,
"step": 267
},
{
"epoch": 0.08134770071331006,
"grad_norm": 0.3752756118774414,
"learning_rate": 9.869393540548751e-05,
"loss": 1.8716,
"step": 268
},
{
"epoch": 0.08165123691000152,
"grad_norm": 0.3437153697013855,
"learning_rate": 9.86888731396173e-05,
"loss": 1.9824,
"step": 269
},
{
"epoch": 0.08195477310669297,
"grad_norm": 0.4854094088077545,
"learning_rate": 9.86838108737471e-05,
"loss": 1.4385,
"step": 270
},
{
"epoch": 0.08225830930338443,
"grad_norm": 0.37674829363822937,
"learning_rate": 9.86787486078769e-05,
"loss": 1.7877,
"step": 271
},
{
"epoch": 0.08256184550007588,
"grad_norm": 0.4215140640735626,
"learning_rate": 9.867368634200669e-05,
"loss": 2.1854,
"step": 272
},
{
"epoch": 0.08286538169676734,
"grad_norm": 0.3680359423160553,
"learning_rate": 9.866862407613648e-05,
"loss": 2.104,
"step": 273
},
{
"epoch": 0.08316891789345879,
"grad_norm": 0.4195649325847626,
"learning_rate": 9.866356181026628e-05,
"loss": 1.469,
"step": 274
},
{
"epoch": 0.08347245409015025,
"grad_norm": 0.480640709400177,
"learning_rate": 9.865849954439607e-05,
"loss": 1.8329,
"step": 275
},
{
"epoch": 0.08377599028684171,
"grad_norm": 0.34760695695877075,
"learning_rate": 9.865343727852587e-05,
"loss": 1.9495,
"step": 276
},
{
"epoch": 0.08407952648353316,
"grad_norm": 0.3803161680698395,
"learning_rate": 9.864837501265568e-05,
"loss": 1.9294,
"step": 277
},
{
"epoch": 0.08438306268022462,
"grad_norm": 0.41739675402641296,
"learning_rate": 9.864331274678547e-05,
"loss": 2.059,
"step": 278
},
{
"epoch": 0.08468659887691607,
"grad_norm": 0.3807448744773865,
"learning_rate": 9.863825048091527e-05,
"loss": 1.9741,
"step": 279
},
{
"epoch": 0.08499013507360753,
"grad_norm": 0.3610997200012207,
"learning_rate": 9.863318821504506e-05,
"loss": 1.9815,
"step": 280
},
{
"epoch": 0.08529367127029898,
"grad_norm": 0.3797460198402405,
"learning_rate": 9.862812594917485e-05,
"loss": 2.1394,
"step": 281
},
{
"epoch": 0.08559720746699044,
"grad_norm": 0.3922887444496155,
"learning_rate": 9.862306368330465e-05,
"loss": 2.184,
"step": 282
},
{
"epoch": 0.08590074366368189,
"grad_norm": 0.38251930475234985,
"learning_rate": 9.861800141743444e-05,
"loss": 2.0186,
"step": 283
},
{
"epoch": 0.08620427986037335,
"grad_norm": 0.35968562960624695,
"learning_rate": 9.861293915156424e-05,
"loss": 2.0,
"step": 284
},
{
"epoch": 0.0865078160570648,
"grad_norm": 0.37149590253829956,
"learning_rate": 9.860787688569403e-05,
"loss": 1.7941,
"step": 285
},
{
"epoch": 0.08681135225375626,
"grad_norm": 0.36890628933906555,
"learning_rate": 9.860281461982383e-05,
"loss": 1.906,
"step": 286
},
{
"epoch": 0.08711488845044772,
"grad_norm": 0.36025917530059814,
"learning_rate": 9.859775235395364e-05,
"loss": 1.9655,
"step": 287
},
{
"epoch": 0.08741842464713917,
"grad_norm": 0.3704364001750946,
"learning_rate": 9.859269008808343e-05,
"loss": 1.8657,
"step": 288
},
{
"epoch": 0.08772196084383063,
"grad_norm": 0.5996513962745667,
"learning_rate": 9.858762782221323e-05,
"loss": 1.7448,
"step": 289
},
{
"epoch": 0.08802549704052208,
"grad_norm": 0.3615630269050598,
"learning_rate": 9.858256555634302e-05,
"loss": 1.9007,
"step": 290
},
{
"epoch": 0.08832903323721354,
"grad_norm": 0.36014246940612793,
"learning_rate": 9.857750329047282e-05,
"loss": 1.927,
"step": 291
},
{
"epoch": 0.08863256943390499,
"grad_norm": 0.5038754940032959,
"learning_rate": 9.857244102460261e-05,
"loss": 1.6613,
"step": 292
},
{
"epoch": 0.08893610563059645,
"grad_norm": 0.3880213797092438,
"learning_rate": 9.85673787587324e-05,
"loss": 1.5563,
"step": 293
},
{
"epoch": 0.0892396418272879,
"grad_norm": 0.43225082755088806,
"learning_rate": 9.85623164928622e-05,
"loss": 1.5534,
"step": 294
},
{
"epoch": 0.08954317802397936,
"grad_norm": 0.44342055916786194,
"learning_rate": 9.8557254226992e-05,
"loss": 1.6211,
"step": 295
},
{
"epoch": 0.08984671422067081,
"grad_norm": 0.42114123702049255,
"learning_rate": 9.85521919611218e-05,
"loss": 1.9731,
"step": 296
},
{
"epoch": 0.09015025041736227,
"grad_norm": 0.43151113390922546,
"learning_rate": 9.85471296952516e-05,
"loss": 1.9519,
"step": 297
},
{
"epoch": 0.09045378661405373,
"grad_norm": 0.38092517852783203,
"learning_rate": 9.85420674293814e-05,
"loss": 2.0973,
"step": 298
},
{
"epoch": 0.09075732281074518,
"grad_norm": 0.40729570388793945,
"learning_rate": 9.853700516351119e-05,
"loss": 1.4395,
"step": 299
},
{
"epoch": 0.09106085900743664,
"grad_norm": 0.3631846308708191,
"learning_rate": 9.8531942897641e-05,
"loss": 1.2255,
"step": 300
},
{
"epoch": 0.09136439520412809,
"grad_norm": 0.37764397263526917,
"learning_rate": 9.852688063177079e-05,
"loss": 1.9941,
"step": 301
},
{
"epoch": 0.09166793140081955,
"grad_norm": 0.3755379319190979,
"learning_rate": 9.852181836590059e-05,
"loss": 1.7154,
"step": 302
},
{
"epoch": 0.091971467597511,
"grad_norm": 0.39003854990005493,
"learning_rate": 9.851675610003038e-05,
"loss": 1.928,
"step": 303
},
{
"epoch": 0.09227500379420246,
"grad_norm": 0.39592432975769043,
"learning_rate": 9.851169383416018e-05,
"loss": 2.1913,
"step": 304
},
{
"epoch": 0.09257853999089391,
"grad_norm": 0.4315894842147827,
"learning_rate": 9.850663156828997e-05,
"loss": 1.6432,
"step": 305
},
{
"epoch": 0.09288207618758537,
"grad_norm": 0.4103511571884155,
"learning_rate": 9.850156930241977e-05,
"loss": 1.9944,
"step": 306
},
{
"epoch": 0.09318561238427682,
"grad_norm": 0.4236547350883484,
"learning_rate": 9.849650703654957e-05,
"loss": 1.875,
"step": 307
},
{
"epoch": 0.09348914858096828,
"grad_norm": 0.41012468934059143,
"learning_rate": 9.849144477067937e-05,
"loss": 2.008,
"step": 308
},
{
"epoch": 0.09379268477765973,
"grad_norm": 0.35538622736930847,
"learning_rate": 9.848638250480916e-05,
"loss": 1.7322,
"step": 309
},
{
"epoch": 0.09409622097435119,
"grad_norm": 0.3874755799770355,
"learning_rate": 9.848132023893896e-05,
"loss": 1.9818,
"step": 310
},
{
"epoch": 0.09439975717104265,
"grad_norm": 0.42444977164268494,
"learning_rate": 9.847625797306875e-05,
"loss": 2.1606,
"step": 311
},
{
"epoch": 0.0947032933677341,
"grad_norm": 0.5855305194854736,
"learning_rate": 9.847119570719855e-05,
"loss": 1.4887,
"step": 312
},
{
"epoch": 0.09500682956442556,
"grad_norm": 0.35223227739334106,
"learning_rate": 9.846613344132834e-05,
"loss": 2.0025,
"step": 313
},
{
"epoch": 0.09531036576111701,
"grad_norm": 0.4013148844242096,
"learning_rate": 9.846107117545814e-05,
"loss": 1.9702,
"step": 314
},
{
"epoch": 0.09561390195780847,
"grad_norm": 0.5038349032402039,
"learning_rate": 9.845600890958793e-05,
"loss": 2.1532,
"step": 315
},
{
"epoch": 0.09591743815449992,
"grad_norm": 0.4826093018054962,
"learning_rate": 9.845094664371774e-05,
"loss": 2.0118,
"step": 316
},
{
"epoch": 0.09622097435119138,
"grad_norm": 0.41135913133621216,
"learning_rate": 9.844588437784754e-05,
"loss": 2.0707,
"step": 317
},
{
"epoch": 0.09652451054788283,
"grad_norm": 0.4353053569793701,
"learning_rate": 9.844082211197733e-05,
"loss": 2.104,
"step": 318
},
{
"epoch": 0.09682804674457429,
"grad_norm": 0.4192908704280853,
"learning_rate": 9.843575984610712e-05,
"loss": 1.9489,
"step": 319
},
{
"epoch": 0.09713158294126574,
"grad_norm": 0.380562424659729,
"learning_rate": 9.843069758023692e-05,
"loss": 1.3602,
"step": 320
},
{
"epoch": 0.0974351191379572,
"grad_norm": 0.3394995331764221,
"learning_rate": 9.842563531436671e-05,
"loss": 2.2161,
"step": 321
},
{
"epoch": 0.09773865533464866,
"grad_norm": 0.3419237434864044,
"learning_rate": 9.842057304849651e-05,
"loss": 1.7146,
"step": 322
},
{
"epoch": 0.09804219153134011,
"grad_norm": 0.3590264618396759,
"learning_rate": 9.84155107826263e-05,
"loss": 1.8654,
"step": 323
},
{
"epoch": 0.09834572772803157,
"grad_norm": 0.40006300806999207,
"learning_rate": 9.84104485167561e-05,
"loss": 1.5787,
"step": 324
},
{
"epoch": 0.09864926392472302,
"grad_norm": 0.33313074707984924,
"learning_rate": 9.84053862508859e-05,
"loss": 1.8653,
"step": 325
},
{
"epoch": 0.09895280012141448,
"grad_norm": 0.39681655168533325,
"learning_rate": 9.84003239850157e-05,
"loss": 2.178,
"step": 326
},
{
"epoch": 0.09925633631810593,
"grad_norm": 0.41945868730545044,
"learning_rate": 9.83952617191455e-05,
"loss": 1.8324,
"step": 327
},
{
"epoch": 0.09955987251479739,
"grad_norm": 0.3957304060459137,
"learning_rate": 9.839019945327529e-05,
"loss": 1.6468,
"step": 328
},
{
"epoch": 0.09986340871148884,
"grad_norm": 0.35814937949180603,
"learning_rate": 9.838513718740509e-05,
"loss": 1.6492,
"step": 329
},
{
"epoch": 0.1001669449081803,
"grad_norm": 0.38410916924476624,
"learning_rate": 9.838007492153488e-05,
"loss": 1.7223,
"step": 330
},
{
"epoch": 0.10047048110487175,
"grad_norm": 0.38490885496139526,
"learning_rate": 9.837501265566468e-05,
"loss": 2.0166,
"step": 331
},
{
"epoch": 0.10077401730156321,
"grad_norm": 0.38943415880203247,
"learning_rate": 9.836995038979447e-05,
"loss": 1.371,
"step": 332
},
{
"epoch": 0.10107755349825466,
"grad_norm": 0.39741018414497375,
"learning_rate": 9.836488812392427e-05,
"loss": 1.6233,
"step": 333
},
{
"epoch": 0.10138108969494612,
"grad_norm": 0.4663957357406616,
"learning_rate": 9.835982585805406e-05,
"loss": 1.746,
"step": 334
},
{
"epoch": 0.10168462589163758,
"grad_norm": 0.37118905782699585,
"learning_rate": 9.835476359218387e-05,
"loss": 1.9684,
"step": 335
},
{
"epoch": 0.10198816208832903,
"grad_norm": 0.40275588631629944,
"learning_rate": 9.834970132631366e-05,
"loss": 1.9551,
"step": 336
},
{
"epoch": 0.1022916982850205,
"grad_norm": 0.4336283206939697,
"learning_rate": 9.834463906044346e-05,
"loss": 2.0711,
"step": 337
},
{
"epoch": 0.10259523448171194,
"grad_norm": 0.35735735297203064,
"learning_rate": 9.833957679457325e-05,
"loss": 2.1397,
"step": 338
},
{
"epoch": 0.1028987706784034,
"grad_norm": 0.37825390696525574,
"learning_rate": 9.833451452870305e-05,
"loss": 1.7494,
"step": 339
},
{
"epoch": 0.10320230687509485,
"grad_norm": 0.3384961783885956,
"learning_rate": 9.832945226283284e-05,
"loss": 2.0197,
"step": 340
},
{
"epoch": 0.10350584307178631,
"grad_norm": 0.46276888251304626,
"learning_rate": 9.832438999696264e-05,
"loss": 1.797,
"step": 341
},
{
"epoch": 0.10380937926847776,
"grad_norm": 0.3685421347618103,
"learning_rate": 9.831932773109243e-05,
"loss": 1.9301,
"step": 342
},
{
"epoch": 0.10411291546516922,
"grad_norm": 0.38931936025619507,
"learning_rate": 9.831426546522223e-05,
"loss": 1.9623,
"step": 343
},
{
"epoch": 0.10441645166186067,
"grad_norm": 0.46678805351257324,
"learning_rate": 9.830920319935204e-05,
"loss": 1.6708,
"step": 344
},
{
"epoch": 0.10471998785855213,
"grad_norm": 0.4199204444885254,
"learning_rate": 9.830414093348183e-05,
"loss": 1.8014,
"step": 345
},
{
"epoch": 0.1050235240552436,
"grad_norm": 0.41024506092071533,
"learning_rate": 9.829907866761164e-05,
"loss": 1.8829,
"step": 346
},
{
"epoch": 0.10532706025193504,
"grad_norm": 0.5271286368370056,
"learning_rate": 9.829401640174143e-05,
"loss": 1.7796,
"step": 347
},
{
"epoch": 0.1056305964486265,
"grad_norm": 0.3593878448009491,
"learning_rate": 9.828895413587123e-05,
"loss": 2.0697,
"step": 348
},
{
"epoch": 0.10593413264531795,
"grad_norm": 0.44404372572898865,
"learning_rate": 9.828389187000102e-05,
"loss": 2.3235,
"step": 349
},
{
"epoch": 0.10623766884200941,
"grad_norm": 0.4072231650352478,
"learning_rate": 9.827882960413082e-05,
"loss": 1.5391,
"step": 350
},
{
"epoch": 0.10654120503870086,
"grad_norm": 0.3924303352832794,
"learning_rate": 9.827376733826061e-05,
"loss": 2.0649,
"step": 351
},
{
"epoch": 0.10684474123539232,
"grad_norm": 0.3815264105796814,
"learning_rate": 9.826870507239041e-05,
"loss": 1.5821,
"step": 352
},
{
"epoch": 0.10714827743208377,
"grad_norm": 0.40832409262657166,
"learning_rate": 9.82636428065202e-05,
"loss": 2.1135,
"step": 353
},
{
"epoch": 0.10745181362877523,
"grad_norm": 0.40270155668258667,
"learning_rate": 9.825858054065e-05,
"loss": 1.6561,
"step": 354
},
{
"epoch": 0.10775534982546668,
"grad_norm": 0.38295283913612366,
"learning_rate": 9.82535182747798e-05,
"loss": 1.8938,
"step": 355
},
{
"epoch": 0.10805888602215814,
"grad_norm": 0.41975417733192444,
"learning_rate": 9.82484560089096e-05,
"loss": 1.8605,
"step": 356
},
{
"epoch": 0.10836242221884959,
"grad_norm": 0.41388946771621704,
"learning_rate": 9.82433937430394e-05,
"loss": 1.812,
"step": 357
},
{
"epoch": 0.10866595841554105,
"grad_norm": 0.3470607101917267,
"learning_rate": 9.823833147716919e-05,
"loss": 2.1914,
"step": 358
},
{
"epoch": 0.10896949461223251,
"grad_norm": 0.4417155385017395,
"learning_rate": 9.823326921129898e-05,
"loss": 1.7644,
"step": 359
},
{
"epoch": 0.10927303080892396,
"grad_norm": 0.33910539746284485,
"learning_rate": 9.822820694542878e-05,
"loss": 1.8821,
"step": 360
},
{
"epoch": 0.10957656700561542,
"grad_norm": 0.36742356419563293,
"learning_rate": 9.822314467955857e-05,
"loss": 1.9684,
"step": 361
},
{
"epoch": 0.10988010320230687,
"grad_norm": 0.407844603061676,
"learning_rate": 9.821808241368837e-05,
"loss": 1.8797,
"step": 362
},
{
"epoch": 0.11018363939899833,
"grad_norm": 0.4090898036956787,
"learning_rate": 9.821302014781816e-05,
"loss": 1.8401,
"step": 363
},
{
"epoch": 0.11048717559568978,
"grad_norm": 0.3852720260620117,
"learning_rate": 9.820795788194796e-05,
"loss": 1.6887,
"step": 364
},
{
"epoch": 0.11079071179238124,
"grad_norm": 0.4147186875343323,
"learning_rate": 9.820289561607777e-05,
"loss": 1.7263,
"step": 365
},
{
"epoch": 0.11109424798907269,
"grad_norm": 0.7032086849212646,
"learning_rate": 9.819783335020756e-05,
"loss": 1.5382,
"step": 366
},
{
"epoch": 0.11139778418576415,
"grad_norm": 0.3547534644603729,
"learning_rate": 9.819277108433736e-05,
"loss": 1.5988,
"step": 367
},
{
"epoch": 0.1117013203824556,
"grad_norm": 0.45878785848617554,
"learning_rate": 9.818770881846715e-05,
"loss": 2.2467,
"step": 368
},
{
"epoch": 0.11200485657914706,
"grad_norm": 0.39183077216148376,
"learning_rate": 9.818264655259695e-05,
"loss": 1.848,
"step": 369
},
{
"epoch": 0.11230839277583853,
"grad_norm": 0.3735283315181732,
"learning_rate": 9.817758428672674e-05,
"loss": 1.6925,
"step": 370
},
{
"epoch": 0.11261192897252997,
"grad_norm": 0.3878265917301178,
"learning_rate": 9.817252202085654e-05,
"loss": 2.04,
"step": 371
},
{
"epoch": 0.11291546516922144,
"grad_norm": 0.38978812098503113,
"learning_rate": 9.816745975498633e-05,
"loss": 1.869,
"step": 372
},
{
"epoch": 0.11321900136591288,
"grad_norm": 0.39212337136268616,
"learning_rate": 9.816239748911613e-05,
"loss": 2.0549,
"step": 373
},
{
"epoch": 0.11352253756260434,
"grad_norm": 0.39528506994247437,
"learning_rate": 9.815733522324593e-05,
"loss": 1.5653,
"step": 374
},
{
"epoch": 0.11382607375929579,
"grad_norm": 0.4226018786430359,
"learning_rate": 9.815227295737573e-05,
"loss": 1.6231,
"step": 375
},
{
"epoch": 0.11412960995598725,
"grad_norm": 0.3577810823917389,
"learning_rate": 9.814721069150552e-05,
"loss": 1.9599,
"step": 376
},
{
"epoch": 0.1144331461526787,
"grad_norm": 0.33580708503723145,
"learning_rate": 9.814214842563532e-05,
"loss": 2.0419,
"step": 377
},
{
"epoch": 0.11473668234937016,
"grad_norm": 0.38860392570495605,
"learning_rate": 9.813708615976511e-05,
"loss": 1.7186,
"step": 378
},
{
"epoch": 0.11504021854606161,
"grad_norm": 0.38994479179382324,
"learning_rate": 9.813202389389491e-05,
"loss": 2.1848,
"step": 379
},
{
"epoch": 0.11534375474275307,
"grad_norm": 0.3947262763977051,
"learning_rate": 9.81269616280247e-05,
"loss": 2.1868,
"step": 380
},
{
"epoch": 0.11564729093944452,
"grad_norm": 0.3112877607345581,
"learning_rate": 9.81218993621545e-05,
"loss": 1.8604,
"step": 381
},
{
"epoch": 0.11595082713613598,
"grad_norm": 0.375689834356308,
"learning_rate": 9.811683709628429e-05,
"loss": 2.0418,
"step": 382
},
{
"epoch": 0.11625436333282745,
"grad_norm": 0.34537243843078613,
"learning_rate": 9.81117748304141e-05,
"loss": 1.8874,
"step": 383
},
{
"epoch": 0.1165578995295189,
"grad_norm": 0.5077370405197144,
"learning_rate": 9.81067125645439e-05,
"loss": 1.7497,
"step": 384
},
{
"epoch": 0.11686143572621036,
"grad_norm": 0.3703441023826599,
"learning_rate": 9.810165029867369e-05,
"loss": 1.781,
"step": 385
},
{
"epoch": 0.1171649719229018,
"grad_norm": 0.4386610984802246,
"learning_rate": 9.809658803280348e-05,
"loss": 1.8428,
"step": 386
},
{
"epoch": 0.11746850811959327,
"grad_norm": 0.37781745195388794,
"learning_rate": 9.809152576693328e-05,
"loss": 2.0384,
"step": 387
},
{
"epoch": 0.11777204431628471,
"grad_norm": 0.38956716656684875,
"learning_rate": 9.808646350106307e-05,
"loss": 2.3534,
"step": 388
},
{
"epoch": 0.11807558051297617,
"grad_norm": 0.3444838523864746,
"learning_rate": 9.808140123519288e-05,
"loss": 1.921,
"step": 389
},
{
"epoch": 0.11837911670966762,
"grad_norm": 0.39881742000579834,
"learning_rate": 9.807633896932268e-05,
"loss": 2.1758,
"step": 390
},
{
"epoch": 0.11868265290635908,
"grad_norm": 0.384226530790329,
"learning_rate": 9.807127670345247e-05,
"loss": 1.7651,
"step": 391
},
{
"epoch": 0.11898618910305053,
"grad_norm": 0.36255109310150146,
"learning_rate": 9.806621443758227e-05,
"loss": 1.8122,
"step": 392
},
{
"epoch": 0.119289725299742,
"grad_norm": 0.3627421259880066,
"learning_rate": 9.806115217171206e-05,
"loss": 1.6304,
"step": 393
},
{
"epoch": 0.11959326149643346,
"grad_norm": 0.8936781883239746,
"learning_rate": 9.805608990584187e-05,
"loss": 1.8827,
"step": 394
},
{
"epoch": 0.1198967976931249,
"grad_norm": 0.5008642673492432,
"learning_rate": 9.805102763997166e-05,
"loss": 1.3597,
"step": 395
},
{
"epoch": 0.12020033388981637,
"grad_norm": 0.4444289207458496,
"learning_rate": 9.804596537410146e-05,
"loss": 2.1768,
"step": 396
},
{
"epoch": 0.12050387008650781,
"grad_norm": 0.3963356912136078,
"learning_rate": 9.804090310823125e-05,
"loss": 1.8373,
"step": 397
},
{
"epoch": 0.12080740628319928,
"grad_norm": 0.44095271825790405,
"learning_rate": 9.803584084236105e-05,
"loss": 1.7893,
"step": 398
},
{
"epoch": 0.12111094247989072,
"grad_norm": 0.4162418246269226,
"learning_rate": 9.803077857649084e-05,
"loss": 1.7482,
"step": 399
},
{
"epoch": 0.12141447867658219,
"grad_norm": 0.3853035271167755,
"learning_rate": 9.802571631062064e-05,
"loss": 1.6274,
"step": 400
},
{
"epoch": 0.12171801487327363,
"grad_norm": 1.1697463989257812,
"learning_rate": 9.802065404475043e-05,
"loss": 2.2254,
"step": 401
},
{
"epoch": 0.1220215510699651,
"grad_norm": 0.3899803161621094,
"learning_rate": 9.801559177888023e-05,
"loss": 1.9754,
"step": 402
},
{
"epoch": 0.12232508726665654,
"grad_norm": 0.43946412205696106,
"learning_rate": 9.801052951301002e-05,
"loss": 2.1184,
"step": 403
},
{
"epoch": 0.122628623463348,
"grad_norm": 0.46882718801498413,
"learning_rate": 9.800546724713983e-05,
"loss": 1.4423,
"step": 404
},
{
"epoch": 0.12293215966003945,
"grad_norm": 0.4379485547542572,
"learning_rate": 9.800040498126963e-05,
"loss": 2.0614,
"step": 405
},
{
"epoch": 0.12323569585673091,
"grad_norm": 0.3837740123271942,
"learning_rate": 9.799534271539942e-05,
"loss": 1.9974,
"step": 406
},
{
"epoch": 0.12353923205342238,
"grad_norm": 0.35403695702552795,
"learning_rate": 9.799028044952922e-05,
"loss": 1.5693,
"step": 407
},
{
"epoch": 0.12384276825011382,
"grad_norm": 0.4070426821708679,
"learning_rate": 9.798521818365901e-05,
"loss": 1.8704,
"step": 408
},
{
"epoch": 0.12414630444680529,
"grad_norm": 0.4301077425479889,
"learning_rate": 9.79801559177888e-05,
"loss": 1.077,
"step": 409
},
{
"epoch": 0.12444984064349673,
"grad_norm": 0.37687429785728455,
"learning_rate": 9.79750936519186e-05,
"loss": 1.7323,
"step": 410
},
{
"epoch": 0.1247533768401882,
"grad_norm": 0.37393873929977417,
"learning_rate": 9.79700313860484e-05,
"loss": 1.9532,
"step": 411
},
{
"epoch": 0.12505691303687966,
"grad_norm": 0.4518846869468689,
"learning_rate": 9.796496912017819e-05,
"loss": 2.0123,
"step": 412
},
{
"epoch": 0.1253604492335711,
"grad_norm": 0.39417609572410583,
"learning_rate": 9.7959906854308e-05,
"loss": 2.2669,
"step": 413
},
{
"epoch": 0.12566398543026255,
"grad_norm": 0.3802976608276367,
"learning_rate": 9.795484458843779e-05,
"loss": 2.0506,
"step": 414
},
{
"epoch": 0.12596752162695402,
"grad_norm": 1.3118431568145752,
"learning_rate": 9.794978232256759e-05,
"loss": 2.2551,
"step": 415
},
{
"epoch": 0.12627105782364548,
"grad_norm": 0.9459638595581055,
"learning_rate": 9.794472005669738e-05,
"loss": 1.7829,
"step": 416
},
{
"epoch": 0.1265745940203369,
"grad_norm": 0.571232795715332,
"learning_rate": 9.793965779082718e-05,
"loss": 1.7768,
"step": 417
},
{
"epoch": 0.12687813021702837,
"grad_norm": 0.3973385989665985,
"learning_rate": 9.793459552495697e-05,
"loss": 1.88,
"step": 418
},
{
"epoch": 0.12718166641371983,
"grad_norm": 0.3883122503757477,
"learning_rate": 9.792953325908677e-05,
"loss": 1.9592,
"step": 419
},
{
"epoch": 0.1274852026104113,
"grad_norm": 0.40379586815834045,
"learning_rate": 9.792447099321656e-05,
"loss": 1.9697,
"step": 420
},
{
"epoch": 0.12778873880710276,
"grad_norm": 0.3288556635379791,
"learning_rate": 9.791940872734636e-05,
"loss": 1.7282,
"step": 421
},
{
"epoch": 0.1280922750037942,
"grad_norm": 0.3872746527194977,
"learning_rate": 9.791434646147616e-05,
"loss": 1.9348,
"step": 422
},
{
"epoch": 0.12839581120048565,
"grad_norm": 0.37058207392692566,
"learning_rate": 9.790928419560596e-05,
"loss": 1.5684,
"step": 423
},
{
"epoch": 0.12869934739717712,
"grad_norm": 0.37466561794281006,
"learning_rate": 9.790422192973575e-05,
"loss": 1.9535,
"step": 424
},
{
"epoch": 0.12900288359386858,
"grad_norm": 0.32176846265792847,
"learning_rate": 9.789915966386555e-05,
"loss": 1.8537,
"step": 425
},
{
"epoch": 0.12930641979056,
"grad_norm": 0.37653467059135437,
"learning_rate": 9.789409739799534e-05,
"loss": 2.0701,
"step": 426
},
{
"epoch": 0.12960995598725147,
"grad_norm": 0.38768434524536133,
"learning_rate": 9.788903513212514e-05,
"loss": 1.731,
"step": 427
},
{
"epoch": 0.12991349218394294,
"grad_norm": 0.5139635801315308,
"learning_rate": 9.788397286625493e-05,
"loss": 2.4437,
"step": 428
},
{
"epoch": 0.1302170283806344,
"grad_norm": 0.3759630024433136,
"learning_rate": 9.787891060038473e-05,
"loss": 2.0918,
"step": 429
},
{
"epoch": 0.13052056457732586,
"grad_norm": 0.3718818426132202,
"learning_rate": 9.787384833451452e-05,
"loss": 1.5854,
"step": 430
},
{
"epoch": 0.1308241007740173,
"grad_norm": 0.6460405588150024,
"learning_rate": 9.786878606864432e-05,
"loss": 2.2442,
"step": 431
},
{
"epoch": 0.13112763697070876,
"grad_norm": 0.40393388271331787,
"learning_rate": 9.786372380277413e-05,
"loss": 1.728,
"step": 432
},
{
"epoch": 0.13143117316740022,
"grad_norm": 0.3772658407688141,
"learning_rate": 9.785866153690393e-05,
"loss": 1.668,
"step": 433
},
{
"epoch": 0.13173470936409168,
"grad_norm": 2.5252649784088135,
"learning_rate": 9.785359927103373e-05,
"loss": 1.8864,
"step": 434
},
{
"epoch": 0.1320382455607831,
"grad_norm": 0.42327219247817993,
"learning_rate": 9.784853700516352e-05,
"loss": 2.3174,
"step": 435
},
{
"epoch": 0.13234178175747457,
"grad_norm": 0.3689473867416382,
"learning_rate": 9.784347473929332e-05,
"loss": 1.9671,
"step": 436
},
{
"epoch": 0.13264531795416604,
"grad_norm": 0.37554243206977844,
"learning_rate": 9.783841247342311e-05,
"loss": 1.783,
"step": 437
},
{
"epoch": 0.1329488541508575,
"grad_norm": 0.409587025642395,
"learning_rate": 9.783335020755291e-05,
"loss": 2.0385,
"step": 438
},
{
"epoch": 0.13325239034754893,
"grad_norm": 0.349252849817276,
"learning_rate": 9.78282879416827e-05,
"loss": 1.8785,
"step": 439
},
{
"epoch": 0.1335559265442404,
"grad_norm": 0.36687588691711426,
"learning_rate": 9.78232256758125e-05,
"loss": 2.1174,
"step": 440
},
{
"epoch": 0.13385946274093186,
"grad_norm": 0.40221846103668213,
"learning_rate": 9.781816340994229e-05,
"loss": 1.8385,
"step": 441
},
{
"epoch": 0.13416299893762332,
"grad_norm": 0.5634617805480957,
"learning_rate": 9.781310114407209e-05,
"loss": 1.9316,
"step": 442
},
{
"epoch": 0.13446653513431478,
"grad_norm": 0.37704020738601685,
"learning_rate": 9.78080388782019e-05,
"loss": 1.8865,
"step": 443
},
{
"epoch": 0.1347700713310062,
"grad_norm": 0.36043843626976013,
"learning_rate": 9.780297661233169e-05,
"loss": 1.585,
"step": 444
},
{
"epoch": 0.13507360752769768,
"grad_norm": 0.33643844723701477,
"learning_rate": 9.779791434646149e-05,
"loss": 1.8098,
"step": 445
},
{
"epoch": 0.13537714372438914,
"grad_norm": 0.6782101988792419,
"learning_rate": 9.779285208059128e-05,
"loss": 2.0468,
"step": 446
},
{
"epoch": 0.1356806799210806,
"grad_norm": 0.38101980090141296,
"learning_rate": 9.778778981472108e-05,
"loss": 2.0624,
"step": 447
},
{
"epoch": 0.13598421611777203,
"grad_norm": 0.399311900138855,
"learning_rate": 9.778272754885087e-05,
"loss": 2.1652,
"step": 448
},
{
"epoch": 0.1362877523144635,
"grad_norm": 0.3491426706314087,
"learning_rate": 9.777766528298066e-05,
"loss": 1.9092,
"step": 449
},
{
"epoch": 0.13659128851115496,
"grad_norm": 0.3654717803001404,
"learning_rate": 9.777260301711046e-05,
"loss": 1.9773,
"step": 450
},
{
"epoch": 0.13689482470784642,
"grad_norm": 0.394699364900589,
"learning_rate": 9.776754075124025e-05,
"loss": 2.1568,
"step": 451
},
{
"epoch": 0.13719836090453785,
"grad_norm": 0.3601212203502655,
"learning_rate": 9.776247848537006e-05,
"loss": 1.8744,
"step": 452
},
{
"epoch": 0.13750189710122931,
"grad_norm": 0.40716952085494995,
"learning_rate": 9.775741621949986e-05,
"loss": 2.1052,
"step": 453
},
{
"epoch": 0.13780543329792078,
"grad_norm": 0.37777504324913025,
"learning_rate": 9.775235395362965e-05,
"loss": 1.8896,
"step": 454
},
{
"epoch": 0.13810896949461224,
"grad_norm": 0.368600994348526,
"learning_rate": 9.774729168775945e-05,
"loss": 1.8285,
"step": 455
},
{
"epoch": 0.1384125056913037,
"grad_norm": 0.41742029786109924,
"learning_rate": 9.774222942188924e-05,
"loss": 1.8286,
"step": 456
},
{
"epoch": 0.13871604188799513,
"grad_norm": 0.40132156014442444,
"learning_rate": 9.773716715601904e-05,
"loss": 1.9515,
"step": 457
},
{
"epoch": 0.1390195780846866,
"grad_norm": 0.44473376870155334,
"learning_rate": 9.773210489014883e-05,
"loss": 1.8715,
"step": 458
},
{
"epoch": 0.13932311428137806,
"grad_norm": 0.40146371722221375,
"learning_rate": 9.772704262427863e-05,
"loss": 2.1469,
"step": 459
},
{
"epoch": 0.13962665047806952,
"grad_norm": 0.3863317370414734,
"learning_rate": 9.772198035840842e-05,
"loss": 1.9215,
"step": 460
},
{
"epoch": 0.13993018667476095,
"grad_norm": 0.40235334634780884,
"learning_rate": 9.771691809253823e-05,
"loss": 2.1276,
"step": 461
},
{
"epoch": 0.14023372287145242,
"grad_norm": 0.46011632680892944,
"learning_rate": 9.771185582666802e-05,
"loss": 1.244,
"step": 462
},
{
"epoch": 0.14053725906814388,
"grad_norm": 0.3428272008895874,
"learning_rate": 9.770679356079782e-05,
"loss": 1.7991,
"step": 463
},
{
"epoch": 0.14084079526483534,
"grad_norm": 0.39976757764816284,
"learning_rate": 9.770173129492761e-05,
"loss": 1.7166,
"step": 464
},
{
"epoch": 0.1411443314615268,
"grad_norm": 0.3258446753025055,
"learning_rate": 9.769666902905741e-05,
"loss": 1.677,
"step": 465
},
{
"epoch": 0.14144786765821823,
"grad_norm": 0.3950905501842499,
"learning_rate": 9.76916067631872e-05,
"loss": 2.0122,
"step": 466
},
{
"epoch": 0.1417514038549097,
"grad_norm": 0.39712047576904297,
"learning_rate": 9.7686544497317e-05,
"loss": 1.7262,
"step": 467
},
{
"epoch": 0.14205494005160116,
"grad_norm": 0.8331599235534668,
"learning_rate": 9.768148223144679e-05,
"loss": 1.9852,
"step": 468
},
{
"epoch": 0.14235847624829262,
"grad_norm": 0.3578427731990814,
"learning_rate": 9.767641996557659e-05,
"loss": 1.8249,
"step": 469
},
{
"epoch": 0.14266201244498405,
"grad_norm": 0.3736058473587036,
"learning_rate": 9.767135769970638e-05,
"loss": 1.43,
"step": 470
},
{
"epoch": 0.14296554864167552,
"grad_norm": 0.48153185844421387,
"learning_rate": 9.766629543383619e-05,
"loss": 1.8667,
"step": 471
},
{
"epoch": 0.14326908483836698,
"grad_norm": 0.3924524188041687,
"learning_rate": 9.766123316796599e-05,
"loss": 2.0385,
"step": 472
},
{
"epoch": 0.14357262103505844,
"grad_norm": 0.38956940174102783,
"learning_rate": 9.765617090209578e-05,
"loss": 1.3157,
"step": 473
},
{
"epoch": 0.14387615723174987,
"grad_norm": 0.4032903015613556,
"learning_rate": 9.765110863622558e-05,
"loss": 1.8793,
"step": 474
},
{
"epoch": 0.14417969342844134,
"grad_norm": 0.5116568207740784,
"learning_rate": 9.764604637035537e-05,
"loss": 1.7658,
"step": 475
},
{
"epoch": 0.1444832296251328,
"grad_norm": 0.3981756269931793,
"learning_rate": 9.764098410448517e-05,
"loss": 1.8087,
"step": 476
},
{
"epoch": 0.14478676582182426,
"grad_norm": 0.43181854486465454,
"learning_rate": 9.763592183861496e-05,
"loss": 1.5241,
"step": 477
},
{
"epoch": 0.14509030201851572,
"grad_norm": 0.4172961413860321,
"learning_rate": 9.763085957274477e-05,
"loss": 1.8318,
"step": 478
},
{
"epoch": 0.14539383821520716,
"grad_norm": 0.4135033190250397,
"learning_rate": 9.762579730687456e-05,
"loss": 2.0783,
"step": 479
},
{
"epoch": 0.14569737441189862,
"grad_norm": 0.36482739448547363,
"learning_rate": 9.762073504100436e-05,
"loss": 2.2524,
"step": 480
},
{
"epoch": 0.14600091060859008,
"grad_norm": 0.3704656958580017,
"learning_rate": 9.761567277513415e-05,
"loss": 2.0369,
"step": 481
},
{
"epoch": 0.14630444680528154,
"grad_norm": 1.588393211364746,
"learning_rate": 9.761061050926396e-05,
"loss": 1.8041,
"step": 482
},
{
"epoch": 0.14660798300197297,
"grad_norm": 0.3309743404388428,
"learning_rate": 9.760554824339376e-05,
"loss": 1.8373,
"step": 483
},
{
"epoch": 0.14691151919866444,
"grad_norm": 0.34598830342292786,
"learning_rate": 9.760048597752355e-05,
"loss": 1.6249,
"step": 484
},
{
"epoch": 0.1472150553953559,
"grad_norm": 0.3433639109134674,
"learning_rate": 9.759542371165335e-05,
"loss": 1.9454,
"step": 485
},
{
"epoch": 0.14751859159204736,
"grad_norm": 0.3801734149456024,
"learning_rate": 9.759036144578314e-05,
"loss": 2.1067,
"step": 486
},
{
"epoch": 0.1478221277887388,
"grad_norm": 0.36811041831970215,
"learning_rate": 9.758529917991293e-05,
"loss": 1.8642,
"step": 487
},
{
"epoch": 0.14812566398543026,
"grad_norm": 0.3999156355857849,
"learning_rate": 9.758023691404273e-05,
"loss": 2.1482,
"step": 488
},
{
"epoch": 0.14842920018212172,
"grad_norm": 0.7651489973068237,
"learning_rate": 9.757517464817252e-05,
"loss": 1.8213,
"step": 489
},
{
"epoch": 0.14873273637881318,
"grad_norm": 0.3491712808609009,
"learning_rate": 9.757011238230232e-05,
"loss": 2.1047,
"step": 490
},
{
"epoch": 0.14903627257550464,
"grad_norm": 1.028256893157959,
"learning_rate": 9.756505011643213e-05,
"loss": 2.0519,
"step": 491
},
{
"epoch": 0.14933980877219608,
"grad_norm": 0.5957101583480835,
"learning_rate": 9.755998785056192e-05,
"loss": 2.1236,
"step": 492
},
{
"epoch": 0.14964334496888754,
"grad_norm": 0.40934717655181885,
"learning_rate": 9.755492558469172e-05,
"loss": 1.5391,
"step": 493
},
{
"epoch": 0.149946881165579,
"grad_norm": 0.4403507709503174,
"learning_rate": 9.754986331882151e-05,
"loss": 1.8388,
"step": 494
},
{
"epoch": 0.15025041736227046,
"grad_norm": 0.4258563220500946,
"learning_rate": 9.754480105295131e-05,
"loss": 1.8092,
"step": 495
},
{
"epoch": 0.1505539535589619,
"grad_norm": 0.3594823181629181,
"learning_rate": 9.75397387870811e-05,
"loss": 1.7195,
"step": 496
},
{
"epoch": 0.15085748975565336,
"grad_norm": 0.30373120307922363,
"learning_rate": 9.75346765212109e-05,
"loss": 1.9267,
"step": 497
},
{
"epoch": 0.15116102595234482,
"grad_norm": 0.423096626996994,
"learning_rate": 9.752961425534069e-05,
"loss": 2.1559,
"step": 498
},
{
"epoch": 0.15146456214903628,
"grad_norm": 0.36935552954673767,
"learning_rate": 9.752455198947049e-05,
"loss": 2.0357,
"step": 499
},
{
"epoch": 0.15176809834572771,
"grad_norm": 0.7172725200653076,
"learning_rate": 9.75194897236003e-05,
"loss": 2.0973,
"step": 500
},
{
"epoch": 0.15207163454241918,
"grad_norm": 0.36897605657577515,
"learning_rate": 9.751442745773009e-05,
"loss": 2.1672,
"step": 501
},
{
"epoch": 0.15237517073911064,
"grad_norm": 0.35079488158226013,
"learning_rate": 9.750936519185988e-05,
"loss": 2.0808,
"step": 502
},
{
"epoch": 0.1526787069358021,
"grad_norm": 0.37833186984062195,
"learning_rate": 9.750430292598968e-05,
"loss": 1.8393,
"step": 503
},
{
"epoch": 0.15298224313249356,
"grad_norm": 0.3969264328479767,
"learning_rate": 9.749924066011947e-05,
"loss": 2.1213,
"step": 504
},
{
"epoch": 0.153285779329185,
"grad_norm": 0.30432841181755066,
"learning_rate": 9.749417839424927e-05,
"loss": 1.6397,
"step": 505
},
{
"epoch": 0.15358931552587646,
"grad_norm": 0.30847886204719543,
"learning_rate": 9.748911612837906e-05,
"loss": 1.6455,
"step": 506
},
{
"epoch": 0.15389285172256792,
"grad_norm": 0.38480496406555176,
"learning_rate": 9.748405386250886e-05,
"loss": 1.803,
"step": 507
},
{
"epoch": 0.15419638791925938,
"grad_norm": 0.48439183831214905,
"learning_rate": 9.747899159663865e-05,
"loss": 1.6892,
"step": 508
},
{
"epoch": 0.15449992411595082,
"grad_norm": 0.5124354362487793,
"learning_rate": 9.747392933076845e-05,
"loss": 2.24,
"step": 509
},
{
"epoch": 0.15480346031264228,
"grad_norm": 0.4051717221736908,
"learning_rate": 9.746886706489826e-05,
"loss": 1.8621,
"step": 510
},
{
"epoch": 0.15510699650933374,
"grad_norm": 0.6452261209487915,
"learning_rate": 9.746380479902805e-05,
"loss": 1.7043,
"step": 511
},
{
"epoch": 0.1554105327060252,
"grad_norm": 0.5453522801399231,
"learning_rate": 9.745874253315785e-05,
"loss": 1.7325,
"step": 512
},
{
"epoch": 0.15571406890271666,
"grad_norm": 1.0983595848083496,
"learning_rate": 9.745368026728764e-05,
"loss": 2.169,
"step": 513
},
{
"epoch": 0.1560176050994081,
"grad_norm": 0.3821035623550415,
"learning_rate": 9.744861800141744e-05,
"loss": 2.3305,
"step": 514
},
{
"epoch": 0.15632114129609956,
"grad_norm": 0.3694508969783783,
"learning_rate": 9.744355573554723e-05,
"loss": 1.8453,
"step": 515
},
{
"epoch": 0.15662467749279102,
"grad_norm": 0.3837510943412781,
"learning_rate": 9.743849346967702e-05,
"loss": 1.9679,
"step": 516
},
{
"epoch": 0.15692821368948248,
"grad_norm": 0.41427966952323914,
"learning_rate": 9.743343120380682e-05,
"loss": 1.9331,
"step": 517
},
{
"epoch": 0.15723174988617392,
"grad_norm": 0.34252259135246277,
"learning_rate": 9.742836893793661e-05,
"loss": 1.7938,
"step": 518
},
{
"epoch": 0.15753528608286538,
"grad_norm": 0.4043283462524414,
"learning_rate": 9.742330667206642e-05,
"loss": 1.4037,
"step": 519
},
{
"epoch": 0.15783882227955684,
"grad_norm": 0.4225389361381531,
"learning_rate": 9.741824440619622e-05,
"loss": 1.6224,
"step": 520
},
{
"epoch": 0.1581423584762483,
"grad_norm": 0.377590537071228,
"learning_rate": 9.741318214032601e-05,
"loss": 2.0567,
"step": 521
},
{
"epoch": 0.15844589467293974,
"grad_norm": 0.46170124411582947,
"learning_rate": 9.740811987445582e-05,
"loss": 2.0449,
"step": 522
},
{
"epoch": 0.1587494308696312,
"grad_norm": 0.3752427399158478,
"learning_rate": 9.740305760858562e-05,
"loss": 1.8207,
"step": 523
},
{
"epoch": 0.15905296706632266,
"grad_norm": 0.390803724527359,
"learning_rate": 9.739799534271541e-05,
"loss": 2.0781,
"step": 524
},
{
"epoch": 0.15935650326301412,
"grad_norm": 0.38587453961372375,
"learning_rate": 9.73929330768452e-05,
"loss": 1.9932,
"step": 525
},
{
"epoch": 0.15966003945970558,
"grad_norm": 0.4154350459575653,
"learning_rate": 9.7387870810975e-05,
"loss": 1.7649,
"step": 526
},
{
"epoch": 0.15996357565639702,
"grad_norm": 0.3698589503765106,
"learning_rate": 9.73828085451048e-05,
"loss": 1.6921,
"step": 527
},
{
"epoch": 0.16026711185308848,
"grad_norm": 0.4110312759876251,
"learning_rate": 9.737774627923459e-05,
"loss": 1.1834,
"step": 528
},
{
"epoch": 0.16057064804977994,
"grad_norm": 0.4140758812427521,
"learning_rate": 9.737268401336438e-05,
"loss": 1.8354,
"step": 529
},
{
"epoch": 0.1608741842464714,
"grad_norm": 0.38738423585891724,
"learning_rate": 9.736762174749419e-05,
"loss": 1.9223,
"step": 530
},
{
"epoch": 0.16117772044316284,
"grad_norm": 0.4055260717868805,
"learning_rate": 9.736255948162399e-05,
"loss": 1.7802,
"step": 531
},
{
"epoch": 0.1614812566398543,
"grad_norm": 0.44946524500846863,
"learning_rate": 9.735749721575378e-05,
"loss": 1.8654,
"step": 532
},
{
"epoch": 0.16178479283654576,
"grad_norm": 0.43206432461738586,
"learning_rate": 9.735243494988358e-05,
"loss": 1.7607,
"step": 533
},
{
"epoch": 0.16208832903323722,
"grad_norm": 0.5007991194725037,
"learning_rate": 9.734737268401337e-05,
"loss": 1.9378,
"step": 534
},
{
"epoch": 0.16239186522992866,
"grad_norm": 0.48757919669151306,
"learning_rate": 9.734231041814317e-05,
"loss": 2.1829,
"step": 535
},
{
"epoch": 0.16269540142662012,
"grad_norm": 0.4159701466560364,
"learning_rate": 9.733724815227296e-05,
"loss": 1.8847,
"step": 536
},
{
"epoch": 0.16299893762331158,
"grad_norm": 0.40922749042510986,
"learning_rate": 9.733218588640276e-05,
"loss": 1.4376,
"step": 537
},
{
"epoch": 0.16330247382000304,
"grad_norm": 0.33677083253860474,
"learning_rate": 9.732712362053255e-05,
"loss": 1.9568,
"step": 538
},
{
"epoch": 0.1636060100166945,
"grad_norm": 0.3255022168159485,
"learning_rate": 9.732206135466236e-05,
"loss": 1.9949,
"step": 539
},
{
"epoch": 0.16390954621338594,
"grad_norm": 0.3848338723182678,
"learning_rate": 9.731699908879215e-05,
"loss": 2.042,
"step": 540
},
{
"epoch": 0.1642130824100774,
"grad_norm": 0.3888263404369354,
"learning_rate": 9.731193682292195e-05,
"loss": 1.885,
"step": 541
},
{
"epoch": 0.16451661860676886,
"grad_norm": 0.40090805292129517,
"learning_rate": 9.730687455705174e-05,
"loss": 1.9093,
"step": 542
},
{
"epoch": 0.16482015480346032,
"grad_norm": 0.4106220602989197,
"learning_rate": 9.730181229118154e-05,
"loss": 1.8392,
"step": 543
},
{
"epoch": 0.16512369100015176,
"grad_norm": 0.3483395278453827,
"learning_rate": 9.729675002531133e-05,
"loss": 2.0235,
"step": 544
},
{
"epoch": 0.16542722719684322,
"grad_norm": 0.3686208128929138,
"learning_rate": 9.729168775944113e-05,
"loss": 1.9218,
"step": 545
},
{
"epoch": 0.16573076339353468,
"grad_norm": 0.36063849925994873,
"learning_rate": 9.728662549357092e-05,
"loss": 1.9334,
"step": 546
},
{
"epoch": 0.16603429959022614,
"grad_norm": 0.39365142583847046,
"learning_rate": 9.728156322770072e-05,
"loss": 1.9825,
"step": 547
},
{
"epoch": 0.16633783578691758,
"grad_norm": 0.4062787592411041,
"learning_rate": 9.727650096183051e-05,
"loss": 1.521,
"step": 548
},
{
"epoch": 0.16664137198360904,
"grad_norm": 0.37347134947776794,
"learning_rate": 9.727143869596032e-05,
"loss": 1.9356,
"step": 549
},
{
"epoch": 0.1669449081803005,
"grad_norm": 0.3538997173309326,
"learning_rate": 9.726637643009012e-05,
"loss": 1.845,
"step": 550
},
{
"epoch": 0.16724844437699196,
"grad_norm": 0.3868335783481598,
"learning_rate": 9.726131416421991e-05,
"loss": 1.9803,
"step": 551
},
{
"epoch": 0.16755198057368342,
"grad_norm": 0.34705451130867004,
"learning_rate": 9.72562518983497e-05,
"loss": 2.0866,
"step": 552
},
{
"epoch": 0.16785551677037486,
"grad_norm": 0.3794872462749481,
"learning_rate": 9.72511896324795e-05,
"loss": 2.094,
"step": 553
},
{
"epoch": 0.16815905296706632,
"grad_norm": 0.5801231861114502,
"learning_rate": 9.72461273666093e-05,
"loss": 1.7851,
"step": 554
},
{
"epoch": 0.16846258916375778,
"grad_norm": 0.3076344132423401,
"learning_rate": 9.724106510073909e-05,
"loss": 1.5188,
"step": 555
},
{
"epoch": 0.16876612536044924,
"grad_norm": 0.3552989363670349,
"learning_rate": 9.723600283486888e-05,
"loss": 2.1063,
"step": 556
},
{
"epoch": 0.16906966155714068,
"grad_norm": 0.36939847469329834,
"learning_rate": 9.723094056899868e-05,
"loss": 1.7648,
"step": 557
},
{
"epoch": 0.16937319775383214,
"grad_norm": 0.358634889125824,
"learning_rate": 9.722587830312849e-05,
"loss": 1.8007,
"step": 558
},
{
"epoch": 0.1696767339505236,
"grad_norm": 0.39962029457092285,
"learning_rate": 9.722081603725828e-05,
"loss": 1.8845,
"step": 559
},
{
"epoch": 0.16998027014721506,
"grad_norm": 0.4099076986312866,
"learning_rate": 9.721575377138808e-05,
"loss": 1.8894,
"step": 560
},
{
"epoch": 0.17028380634390652,
"grad_norm": 0.3610551655292511,
"learning_rate": 9.721069150551787e-05,
"loss": 1.8089,
"step": 561
},
{
"epoch": 0.17058734254059796,
"grad_norm": 0.5951200723648071,
"learning_rate": 9.720562923964767e-05,
"loss": 1.6966,
"step": 562
},
{
"epoch": 0.17089087873728942,
"grad_norm": 0.562522292137146,
"learning_rate": 9.720056697377746e-05,
"loss": 1.7704,
"step": 563
},
{
"epoch": 0.17119441493398088,
"grad_norm": 0.6662526726722717,
"learning_rate": 9.719550470790726e-05,
"loss": 1.7714,
"step": 564
},
{
"epoch": 0.17149795113067234,
"grad_norm": 0.44034865498542786,
"learning_rate": 9.719044244203705e-05,
"loss": 2.1042,
"step": 565
},
{
"epoch": 0.17180148732736378,
"grad_norm": 0.39868202805519104,
"learning_rate": 9.718538017616685e-05,
"loss": 1.952,
"step": 566
},
{
"epoch": 0.17210502352405524,
"grad_norm": 0.3427380621433258,
"learning_rate": 9.718031791029665e-05,
"loss": 2.037,
"step": 567
},
{
"epoch": 0.1724085597207467,
"grad_norm": 0.37980929017066956,
"learning_rate": 9.717525564442645e-05,
"loss": 1.5378,
"step": 568
},
{
"epoch": 0.17271209591743816,
"grad_norm": 0.32314518094062805,
"learning_rate": 9.717019337855626e-05,
"loss": 1.6191,
"step": 569
},
{
"epoch": 0.1730156321141296,
"grad_norm": 0.40600740909576416,
"learning_rate": 9.716513111268605e-05,
"loss": 1.6055,
"step": 570
},
{
"epoch": 0.17331916831082106,
"grad_norm": 0.37318041920661926,
"learning_rate": 9.716006884681585e-05,
"loss": 1.8666,
"step": 571
},
{
"epoch": 0.17362270450751252,
"grad_norm": 0.3656068444252014,
"learning_rate": 9.715500658094564e-05,
"loss": 1.5983,
"step": 572
},
{
"epoch": 0.17392624070420398,
"grad_norm": 0.3546827733516693,
"learning_rate": 9.714994431507544e-05,
"loss": 2.2088,
"step": 573
},
{
"epoch": 0.17422977690089544,
"grad_norm": 0.4293152689933777,
"learning_rate": 9.714488204920523e-05,
"loss": 1.803,
"step": 574
},
{
"epoch": 0.17453331309758688,
"grad_norm": 0.3790314495563507,
"learning_rate": 9.713981978333503e-05,
"loss": 1.9874,
"step": 575
},
{
"epoch": 0.17483684929427834,
"grad_norm": 0.37619829177856445,
"learning_rate": 9.713475751746482e-05,
"loss": 1.9061,
"step": 576
},
{
"epoch": 0.1751403854909698,
"grad_norm": 0.36988991498947144,
"learning_rate": 9.712969525159462e-05,
"loss": 1.5463,
"step": 577
},
{
"epoch": 0.17544392168766126,
"grad_norm": 0.367721825838089,
"learning_rate": 9.712463298572442e-05,
"loss": 1.6526,
"step": 578
},
{
"epoch": 0.1757474578843527,
"grad_norm": 0.39620110392570496,
"learning_rate": 9.711957071985422e-05,
"loss": 2.056,
"step": 579
},
{
"epoch": 0.17605099408104416,
"grad_norm": 0.41518276929855347,
"learning_rate": 9.711450845398401e-05,
"loss": 1.6847,
"step": 580
},
{
"epoch": 0.17635453027773562,
"grad_norm": 0.3925170302391052,
"learning_rate": 9.710944618811381e-05,
"loss": 1.8476,
"step": 581
},
{
"epoch": 0.17665806647442708,
"grad_norm": 0.36658090353012085,
"learning_rate": 9.71043839222436e-05,
"loss": 2.0699,
"step": 582
},
{
"epoch": 0.17696160267111852,
"grad_norm": 0.3741433620452881,
"learning_rate": 9.70993216563734e-05,
"loss": 1.9645,
"step": 583
},
{
"epoch": 0.17726513886780998,
"grad_norm": 0.3742316663265228,
"learning_rate": 9.709425939050319e-05,
"loss": 2.3717,
"step": 584
},
{
"epoch": 0.17756867506450144,
"grad_norm": 0.3796440660953522,
"learning_rate": 9.708919712463299e-05,
"loss": 1.9356,
"step": 585
},
{
"epoch": 0.1778722112611929,
"grad_norm": 0.3976511061191559,
"learning_rate": 9.708413485876278e-05,
"loss": 2.1889,
"step": 586
},
{
"epoch": 0.17817574745788436,
"grad_norm": 0.34445542097091675,
"learning_rate": 9.707907259289258e-05,
"loss": 1.6535,
"step": 587
},
{
"epoch": 0.1784792836545758,
"grad_norm": 0.3982098698616028,
"learning_rate": 9.707401032702239e-05,
"loss": 2.0542,
"step": 588
},
{
"epoch": 0.17878281985126726,
"grad_norm": 0.42155295610427856,
"learning_rate": 9.706894806115218e-05,
"loss": 1.4605,
"step": 589
},
{
"epoch": 0.17908635604795872,
"grad_norm": 0.36341744661331177,
"learning_rate": 9.706388579528197e-05,
"loss": 1.8069,
"step": 590
},
{
"epoch": 0.17938989224465018,
"grad_norm": 0.3715178668498993,
"learning_rate": 9.705882352941177e-05,
"loss": 1.5512,
"step": 591
},
{
"epoch": 0.17969342844134162,
"grad_norm": 0.376767635345459,
"learning_rate": 9.705376126354156e-05,
"loss": 1.6027,
"step": 592
},
{
"epoch": 0.17999696463803308,
"grad_norm": 0.4033347964286804,
"learning_rate": 9.704869899767136e-05,
"loss": 1.5071,
"step": 593
},
{
"epoch": 0.18030050083472454,
"grad_norm": 0.8200478553771973,
"learning_rate": 9.704363673180115e-05,
"loss": 1.924,
"step": 594
},
{
"epoch": 0.180604037031416,
"grad_norm": 0.6224507093429565,
"learning_rate": 9.703857446593095e-05,
"loss": 1.9684,
"step": 595
},
{
"epoch": 0.18090757322810747,
"grad_norm": 0.32032859325408936,
"learning_rate": 9.703351220006074e-05,
"loss": 1.9478,
"step": 596
},
{
"epoch": 0.1812111094247989,
"grad_norm": 0.33331337571144104,
"learning_rate": 9.702844993419055e-05,
"loss": 1.8177,
"step": 597
},
{
"epoch": 0.18151464562149036,
"grad_norm": 0.47399207949638367,
"learning_rate": 9.702338766832035e-05,
"loss": 2.07,
"step": 598
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.30480411648750305,
"learning_rate": 9.701832540245014e-05,
"loss": 2.0407,
"step": 599
},
{
"epoch": 0.18212171801487329,
"grad_norm": 0.40148988366127014,
"learning_rate": 9.701326313657994e-05,
"loss": 1.8774,
"step": 600
},
{
"epoch": 0.18242525421156472,
"grad_norm": 0.3958423137664795,
"learning_rate": 9.700820087070973e-05,
"loss": 1.8462,
"step": 601
},
{
"epoch": 0.18272879040825618,
"grad_norm": 0.34824639558792114,
"learning_rate": 9.700313860483953e-05,
"loss": 1.7839,
"step": 602
},
{
"epoch": 0.18303232660494764,
"grad_norm": 0.38002872467041016,
"learning_rate": 9.699807633896932e-05,
"loss": 2.3237,
"step": 603
},
{
"epoch": 0.1833358628016391,
"grad_norm": 0.37800419330596924,
"learning_rate": 9.699301407309912e-05,
"loss": 1.9375,
"step": 604
},
{
"epoch": 0.18363939899833054,
"grad_norm": 0.4041115939617157,
"learning_rate": 9.698795180722891e-05,
"loss": 2.029,
"step": 605
},
{
"epoch": 0.183942935195022,
"grad_norm": 0.3697315454483032,
"learning_rate": 9.698288954135872e-05,
"loss": 1.894,
"step": 606
},
{
"epoch": 0.18424647139171346,
"grad_norm": 0.3809906542301178,
"learning_rate": 9.697782727548851e-05,
"loss": 1.8242,
"step": 607
},
{
"epoch": 0.18455000758840492,
"grad_norm": 0.3997717499732971,
"learning_rate": 9.697276500961831e-05,
"loss": 2.0522,
"step": 608
},
{
"epoch": 0.18485354378509639,
"grad_norm": 0.391699880361557,
"learning_rate": 9.69677027437481e-05,
"loss": 1.8521,
"step": 609
},
{
"epoch": 0.18515707998178782,
"grad_norm": 0.3667858839035034,
"learning_rate": 9.69626404778779e-05,
"loss": 1.7613,
"step": 610
},
{
"epoch": 0.18546061617847928,
"grad_norm": 0.3905411958694458,
"learning_rate": 9.69575782120077e-05,
"loss": 1.8285,
"step": 611
},
{
"epoch": 0.18576415237517074,
"grad_norm": 0.4121951758861542,
"learning_rate": 9.69525159461375e-05,
"loss": 1.8104,
"step": 612
},
{
"epoch": 0.1860676885718622,
"grad_norm": 0.34977591037750244,
"learning_rate": 9.69474536802673e-05,
"loss": 1.7737,
"step": 613
},
{
"epoch": 0.18637122476855364,
"grad_norm": 0.34084367752075195,
"learning_rate": 9.694239141439709e-05,
"loss": 2.0407,
"step": 614
},
{
"epoch": 0.1866747609652451,
"grad_norm": 0.35442525148391724,
"learning_rate": 9.693732914852689e-05,
"loss": 1.9152,
"step": 615
},
{
"epoch": 0.18697829716193656,
"grad_norm": 0.34404149651527405,
"learning_rate": 9.693226688265668e-05,
"loss": 1.7621,
"step": 616
},
{
"epoch": 0.18728183335862802,
"grad_norm": 0.4516477882862091,
"learning_rate": 9.692720461678649e-05,
"loss": 1.7624,
"step": 617
},
{
"epoch": 0.18758536955531946,
"grad_norm": 0.3506614565849304,
"learning_rate": 9.692214235091628e-05,
"loss": 1.6627,
"step": 618
},
{
"epoch": 0.18788890575201092,
"grad_norm": 0.9165719151496887,
"learning_rate": 9.691708008504608e-05,
"loss": 2.1926,
"step": 619
},
{
"epoch": 0.18819244194870238,
"grad_norm": 0.3361871838569641,
"learning_rate": 9.691201781917587e-05,
"loss": 1.5229,
"step": 620
},
{
"epoch": 0.18849597814539384,
"grad_norm": 0.32639381289482117,
"learning_rate": 9.690695555330567e-05,
"loss": 1.8778,
"step": 621
},
{
"epoch": 0.1887995143420853,
"grad_norm": 0.44261273741722107,
"learning_rate": 9.690189328743546e-05,
"loss": 2.0903,
"step": 622
},
{
"epoch": 0.18910305053877674,
"grad_norm": 0.4438890516757965,
"learning_rate": 9.689683102156526e-05,
"loss": 1.772,
"step": 623
},
{
"epoch": 0.1894065867354682,
"grad_norm": 0.40160682797431946,
"learning_rate": 9.689176875569505e-05,
"loss": 2.0964,
"step": 624
},
{
"epoch": 0.18971012293215966,
"grad_norm": 0.4022195637226105,
"learning_rate": 9.688670648982485e-05,
"loss": 1.7818,
"step": 625
},
{
"epoch": 0.19001365912885113,
"grad_norm": 0.4233214855194092,
"learning_rate": 9.688164422395464e-05,
"loss": 1.922,
"step": 626
},
{
"epoch": 0.19031719532554256,
"grad_norm": 0.3864254057407379,
"learning_rate": 9.687658195808445e-05,
"loss": 2.0279,
"step": 627
},
{
"epoch": 0.19062073152223402,
"grad_norm": 0.36527585983276367,
"learning_rate": 9.687151969221424e-05,
"loss": 2.0732,
"step": 628
},
{
"epoch": 0.19092426771892548,
"grad_norm": 0.399237722158432,
"learning_rate": 9.686645742634404e-05,
"loss": 1.8889,
"step": 629
},
{
"epoch": 0.19122780391561695,
"grad_norm": 0.3860459625720978,
"learning_rate": 9.686139516047383e-05,
"loss": 1.968,
"step": 630
},
{
"epoch": 0.19153134011230838,
"grad_norm": 0.32555973529815674,
"learning_rate": 9.685633289460363e-05,
"loss": 2.0722,
"step": 631
},
{
"epoch": 0.19183487630899984,
"grad_norm": 0.6093998551368713,
"learning_rate": 9.685127062873342e-05,
"loss": 1.8553,
"step": 632
},
{
"epoch": 0.1921384125056913,
"grad_norm": 0.4218057692050934,
"learning_rate": 9.684620836286322e-05,
"loss": 1.9647,
"step": 633
},
{
"epoch": 0.19244194870238276,
"grad_norm": 0.3779148757457733,
"learning_rate": 9.684114609699301e-05,
"loss": 2.0681,
"step": 634
},
{
"epoch": 0.19274548489907423,
"grad_norm": 0.3820381760597229,
"learning_rate": 9.683608383112281e-05,
"loss": 2.0603,
"step": 635
},
{
"epoch": 0.19304902109576566,
"grad_norm": 0.29337063431739807,
"learning_rate": 9.683102156525262e-05,
"loss": 1.7516,
"step": 636
},
{
"epoch": 0.19335255729245712,
"grad_norm": 0.4369249939918518,
"learning_rate": 9.682595929938241e-05,
"loss": 1.9822,
"step": 637
},
{
"epoch": 0.19365609348914858,
"grad_norm": 0.3766214847564697,
"learning_rate": 9.68208970335122e-05,
"loss": 1.7229,
"step": 638
},
{
"epoch": 0.19395962968584005,
"grad_norm": 0.4765011668205261,
"learning_rate": 9.6815834767642e-05,
"loss": 1.2865,
"step": 639
},
{
"epoch": 0.19426316588253148,
"grad_norm": 0.34236472845077515,
"learning_rate": 9.68107725017718e-05,
"loss": 2.1024,
"step": 640
},
{
"epoch": 0.19456670207922294,
"grad_norm": 0.398076593875885,
"learning_rate": 9.680571023590159e-05,
"loss": 1.8628,
"step": 641
},
{
"epoch": 0.1948702382759144,
"grad_norm": 0.357099711894989,
"learning_rate": 9.680064797003139e-05,
"loss": 2.2163,
"step": 642
},
{
"epoch": 0.19517377447260587,
"grad_norm": 0.3296545445919037,
"learning_rate": 9.679558570416118e-05,
"loss": 1.8227,
"step": 643
},
{
"epoch": 0.19547731066929733,
"grad_norm": 0.36754927039146423,
"learning_rate": 9.679052343829098e-05,
"loss": 1.7179,
"step": 644
},
{
"epoch": 0.19578084686598876,
"grad_norm": 0.37275364995002747,
"learning_rate": 9.678546117242078e-05,
"loss": 1.6782,
"step": 645
},
{
"epoch": 0.19608438306268022,
"grad_norm": 0.3951006531715393,
"learning_rate": 9.678039890655058e-05,
"loss": 2.0756,
"step": 646
},
{
"epoch": 0.19638791925937168,
"grad_norm": 0.3560970425605774,
"learning_rate": 9.677533664068037e-05,
"loss": 1.8093,
"step": 647
},
{
"epoch": 0.19669145545606315,
"grad_norm": 0.31553730368614197,
"learning_rate": 9.677027437481017e-05,
"loss": 1.9174,
"step": 648
},
{
"epoch": 0.19699499165275458,
"grad_norm": 0.39949625730514526,
"learning_rate": 9.676521210893996e-05,
"loss": 1.6687,
"step": 649
},
{
"epoch": 0.19729852784944604,
"grad_norm": 0.37323635816574097,
"learning_rate": 9.676014984306976e-05,
"loss": 1.8149,
"step": 650
},
{
"epoch": 0.1976020640461375,
"grad_norm": 0.43527746200561523,
"learning_rate": 9.675508757719955e-05,
"loss": 1.8744,
"step": 651
},
{
"epoch": 0.19790560024282897,
"grad_norm": 0.39380425214767456,
"learning_rate": 9.675002531132935e-05,
"loss": 1.9721,
"step": 652
},
{
"epoch": 0.1982091364395204,
"grad_norm": 0.3384545147418976,
"learning_rate": 9.674496304545914e-05,
"loss": 2.0122,
"step": 653
},
{
"epoch": 0.19851267263621186,
"grad_norm": 0.39647915959358215,
"learning_rate": 9.673990077958894e-05,
"loss": 2.2419,
"step": 654
},
{
"epoch": 0.19881620883290332,
"grad_norm": 0.3358941674232483,
"learning_rate": 9.673483851371875e-05,
"loss": 1.8758,
"step": 655
},
{
"epoch": 0.19911974502959479,
"grad_norm": 0.3486049771308899,
"learning_rate": 9.672977624784855e-05,
"loss": 1.5762,
"step": 656
},
{
"epoch": 0.19942328122628625,
"grad_norm": 2.3050696849823,
"learning_rate": 9.672471398197835e-05,
"loss": 2.0056,
"step": 657
},
{
"epoch": 0.19972681742297768,
"grad_norm": 0.35023945569992065,
"learning_rate": 9.671965171610814e-05,
"loss": 1.619,
"step": 658
},
{
"epoch": 0.20003035361966914,
"grad_norm": 0.513656735420227,
"learning_rate": 9.671458945023794e-05,
"loss": 1.5269,
"step": 659
},
{
"epoch": 0.2003338898163606,
"grad_norm": 0.37498149275779724,
"learning_rate": 9.670952718436773e-05,
"loss": 1.8553,
"step": 660
},
{
"epoch": 0.20063742601305207,
"grad_norm": 0.4101942479610443,
"learning_rate": 9.670446491849753e-05,
"loss": 2.1121,
"step": 661
},
{
"epoch": 0.2009409622097435,
"grad_norm": 0.4265679717063904,
"learning_rate": 9.669940265262732e-05,
"loss": 2.1863,
"step": 662
},
{
"epoch": 0.20124449840643496,
"grad_norm": 4.817168712615967,
"learning_rate": 9.669434038675712e-05,
"loss": 2.0906,
"step": 663
},
{
"epoch": 0.20154803460312642,
"grad_norm": 7.518252849578857,
"learning_rate": 9.668927812088691e-05,
"loss": 1.8889,
"step": 664
},
{
"epoch": 0.2018515707998179,
"grad_norm": 0.5480749011039734,
"learning_rate": 9.66842158550167e-05,
"loss": 1.8439,
"step": 665
},
{
"epoch": 0.20215510699650932,
"grad_norm": 0.3578292429447174,
"learning_rate": 9.667915358914651e-05,
"loss": 1.8742,
"step": 666
},
{
"epoch": 0.20245864319320078,
"grad_norm": 0.3799275755882263,
"learning_rate": 9.667409132327631e-05,
"loss": 1.994,
"step": 667
},
{
"epoch": 0.20276217938989224,
"grad_norm": 0.3736335039138794,
"learning_rate": 9.66690290574061e-05,
"loss": 1.7933,
"step": 668
},
{
"epoch": 0.2030657155865837,
"grad_norm": 0.3145211637020111,
"learning_rate": 9.66639667915359e-05,
"loss": 1.8193,
"step": 669
},
{
"epoch": 0.20336925178327517,
"grad_norm": 0.4940774142742157,
"learning_rate": 9.66589045256657e-05,
"loss": 1.9238,
"step": 670
},
{
"epoch": 0.2036727879799666,
"grad_norm": 0.431134968996048,
"learning_rate": 9.665384225979549e-05,
"loss": 1.5493,
"step": 671
},
{
"epoch": 0.20397632417665806,
"grad_norm": 0.41438859701156616,
"learning_rate": 9.664877999392528e-05,
"loss": 1.2076,
"step": 672
},
{
"epoch": 0.20427986037334953,
"grad_norm": 0.38191312551498413,
"learning_rate": 9.664371772805508e-05,
"loss": 1.8201,
"step": 673
},
{
"epoch": 0.204583396570041,
"grad_norm": 0.3938577175140381,
"learning_rate": 9.663865546218487e-05,
"loss": 1.5166,
"step": 674
},
{
"epoch": 0.20488693276673242,
"grad_norm": 0.46312233805656433,
"learning_rate": 9.663359319631468e-05,
"loss": 1.4652,
"step": 675
},
{
"epoch": 0.20519046896342388,
"grad_norm": 0.4087234139442444,
"learning_rate": 9.662853093044448e-05,
"loss": 1.8288,
"step": 676
},
{
"epoch": 0.20549400516011535,
"grad_norm": 0.37329304218292236,
"learning_rate": 9.662346866457427e-05,
"loss": 1.9084,
"step": 677
},
{
"epoch": 0.2057975413568068,
"grad_norm": 0.37109607458114624,
"learning_rate": 9.661840639870407e-05,
"loss": 1.9674,
"step": 678
},
{
"epoch": 0.20610107755349824,
"grad_norm": 0.3936561942100525,
"learning_rate": 9.661334413283386e-05,
"loss": 2.0342,
"step": 679
},
{
"epoch": 0.2064046137501897,
"grad_norm": 0.4621008634567261,
"learning_rate": 9.660828186696366e-05,
"loss": 1.5157,
"step": 680
},
{
"epoch": 0.20670814994688116,
"grad_norm": 0.3849358558654785,
"learning_rate": 9.660321960109345e-05,
"loss": 2.1513,
"step": 681
},
{
"epoch": 0.20701168614357263,
"grad_norm": 0.4873330295085907,
"learning_rate": 9.659815733522325e-05,
"loss": 1.9116,
"step": 682
},
{
"epoch": 0.2073152223402641,
"grad_norm": 0.4687885642051697,
"learning_rate": 9.659309506935304e-05,
"loss": 2.278,
"step": 683
},
{
"epoch": 0.20761875853695552,
"grad_norm": 0.3966952860355377,
"learning_rate": 9.658803280348285e-05,
"loss": 1.4625,
"step": 684
},
{
"epoch": 0.20792229473364698,
"grad_norm": 0.5782402157783508,
"learning_rate": 9.658297053761264e-05,
"loss": 2.2779,
"step": 685
},
{
"epoch": 0.20822583093033845,
"grad_norm": 0.37465688586235046,
"learning_rate": 9.657790827174244e-05,
"loss": 1.8462,
"step": 686
},
{
"epoch": 0.2085293671270299,
"grad_norm": 0.34408631920814514,
"learning_rate": 9.657284600587223e-05,
"loss": 1.9881,
"step": 687
},
{
"epoch": 0.20883290332372134,
"grad_norm": 0.6892307996749878,
"learning_rate": 9.656778374000203e-05,
"loss": 1.9835,
"step": 688
},
{
"epoch": 0.2091364395204128,
"grad_norm": 0.3698042631149292,
"learning_rate": 9.656272147413182e-05,
"loss": 2.0665,
"step": 689
},
{
"epoch": 0.20943997571710427,
"grad_norm": 0.41265738010406494,
"learning_rate": 9.655765920826162e-05,
"loss": 2.0231,
"step": 690
},
{
"epoch": 0.20974351191379573,
"grad_norm": 0.38251030445098877,
"learning_rate": 9.655259694239141e-05,
"loss": 1.7058,
"step": 691
},
{
"epoch": 0.2100470481104872,
"grad_norm": 0.468905508518219,
"learning_rate": 9.65475346765212e-05,
"loss": 1.6182,
"step": 692
},
{
"epoch": 0.21035058430717862,
"grad_norm": 1.0570484399795532,
"learning_rate": 9.6542472410651e-05,
"loss": 2.0165,
"step": 693
},
{
"epoch": 0.21065412050387008,
"grad_norm": 0.3978007435798645,
"learning_rate": 9.653741014478081e-05,
"loss": 1.7859,
"step": 694
},
{
"epoch": 0.21095765670056155,
"grad_norm": 0.42616939544677734,
"learning_rate": 9.65323478789106e-05,
"loss": 1.5197,
"step": 695
},
{
"epoch": 0.211261192897253,
"grad_norm": 0.39380377531051636,
"learning_rate": 9.65272856130404e-05,
"loss": 1.3796,
"step": 696
},
{
"epoch": 0.21156472909394444,
"grad_norm": 0.38581010699272156,
"learning_rate": 9.65222233471702e-05,
"loss": 1.8214,
"step": 697
},
{
"epoch": 0.2118682652906359,
"grad_norm": 0.3610150218009949,
"learning_rate": 9.651716108129999e-05,
"loss": 1.897,
"step": 698
},
{
"epoch": 0.21217180148732737,
"grad_norm": 0.44913700222969055,
"learning_rate": 9.651209881542978e-05,
"loss": 1.8873,
"step": 699
},
{
"epoch": 0.21247533768401883,
"grad_norm": 1.9599745273590088,
"learning_rate": 9.650703654955959e-05,
"loss": 1.946,
"step": 700
},
{
"epoch": 0.21277887388071026,
"grad_norm": 1.195716381072998,
"learning_rate": 9.650197428368939e-05,
"loss": 1.8749,
"step": 701
},
{
"epoch": 0.21308241007740172,
"grad_norm": 0.3154665231704712,
"learning_rate": 9.649691201781918e-05,
"loss": 1.5924,
"step": 702
},
{
"epoch": 0.21338594627409319,
"grad_norm": 0.3550672233104706,
"learning_rate": 9.649184975194898e-05,
"loss": 1.6094,
"step": 703
},
{
"epoch": 0.21368948247078465,
"grad_norm": 0.33744126558303833,
"learning_rate": 9.648678748607877e-05,
"loss": 1.3399,
"step": 704
},
{
"epoch": 0.2139930186674761,
"grad_norm": 0.33931079506874084,
"learning_rate": 9.648172522020858e-05,
"loss": 2.0096,
"step": 705
},
{
"epoch": 0.21429655486416754,
"grad_norm": 0.38951364159584045,
"learning_rate": 9.647666295433837e-05,
"loss": 1.7676,
"step": 706
},
{
"epoch": 0.214600091060859,
"grad_norm": 0.408087819814682,
"learning_rate": 9.647160068846817e-05,
"loss": 1.7948,
"step": 707
},
{
"epoch": 0.21490362725755047,
"grad_norm": 0.37058812379837036,
"learning_rate": 9.646653842259796e-05,
"loss": 1.9891,
"step": 708
},
{
"epoch": 0.21520716345424193,
"grad_norm": 0.4003254473209381,
"learning_rate": 9.646147615672776e-05,
"loss": 1.8895,
"step": 709
},
{
"epoch": 0.21551069965093336,
"grad_norm": 0.38838204741477966,
"learning_rate": 9.645641389085755e-05,
"loss": 2.0121,
"step": 710
},
{
"epoch": 0.21581423584762482,
"grad_norm": 0.41912707686424255,
"learning_rate": 9.645135162498735e-05,
"loss": 1.9804,
"step": 711
},
{
"epoch": 0.2161177720443163,
"grad_norm": 0.353454053401947,
"learning_rate": 9.644628935911714e-05,
"loss": 2.0478,
"step": 712
},
{
"epoch": 0.21642130824100775,
"grad_norm": 0.3825720548629761,
"learning_rate": 9.644122709324694e-05,
"loss": 1.6676,
"step": 713
},
{
"epoch": 0.21672484443769918,
"grad_norm": 0.4197389781475067,
"learning_rate": 9.643616482737675e-05,
"loss": 1.9732,
"step": 714
},
{
"epoch": 0.21702838063439064,
"grad_norm": 0.4452435076236725,
"learning_rate": 9.643110256150654e-05,
"loss": 2.0918,
"step": 715
},
{
"epoch": 0.2173319168310821,
"grad_norm": 0.3366299271583557,
"learning_rate": 9.642604029563634e-05,
"loss": 1.7469,
"step": 716
},
{
"epoch": 0.21763545302777357,
"grad_norm": 0.31280553340911865,
"learning_rate": 9.642097802976613e-05,
"loss": 2.0348,
"step": 717
},
{
"epoch": 0.21793898922446503,
"grad_norm": 0.425503671169281,
"learning_rate": 9.641591576389593e-05,
"loss": 1.3629,
"step": 718
},
{
"epoch": 0.21824252542115646,
"grad_norm": 0.3986441493034363,
"learning_rate": 9.641085349802572e-05,
"loss": 1.4703,
"step": 719
},
{
"epoch": 0.21854606161784793,
"grad_norm": 0.34377026557922363,
"learning_rate": 9.640579123215552e-05,
"loss": 1.9788,
"step": 720
},
{
"epoch": 0.2188495978145394,
"grad_norm": 0.3445621430873871,
"learning_rate": 9.640072896628531e-05,
"loss": 1.9137,
"step": 721
},
{
"epoch": 0.21915313401123085,
"grad_norm": 0.40363574028015137,
"learning_rate": 9.63956667004151e-05,
"loss": 1.8911,
"step": 722
},
{
"epoch": 0.21945667020792228,
"grad_norm": 0.36166059970855713,
"learning_rate": 9.639060443454491e-05,
"loss": 1.9176,
"step": 723
},
{
"epoch": 0.21976020640461374,
"grad_norm": 0.7732321619987488,
"learning_rate": 9.638554216867471e-05,
"loss": 2.1942,
"step": 724
},
{
"epoch": 0.2200637426013052,
"grad_norm": 0.4042604863643646,
"learning_rate": 9.63804799028045e-05,
"loss": 1.8964,
"step": 725
},
{
"epoch": 0.22036727879799667,
"grad_norm": 0.3888862133026123,
"learning_rate": 9.63754176369343e-05,
"loss": 1.716,
"step": 726
},
{
"epoch": 0.22067081499468813,
"grad_norm": 0.32185250520706177,
"learning_rate": 9.637035537106409e-05,
"loss": 2.1227,
"step": 727
},
{
"epoch": 0.22097435119137956,
"grad_norm": 0.36421746015548706,
"learning_rate": 9.636529310519389e-05,
"loss": 1.3262,
"step": 728
},
{
"epoch": 0.22127788738807103,
"grad_norm": 0.42780765891075134,
"learning_rate": 9.636023083932368e-05,
"loss": 1.806,
"step": 729
},
{
"epoch": 0.2215814235847625,
"grad_norm": 0.3754510283470154,
"learning_rate": 9.635516857345348e-05,
"loss": 1.9286,
"step": 730
},
{
"epoch": 0.22188495978145395,
"grad_norm": 0.35199174284935,
"learning_rate": 9.635010630758327e-05,
"loss": 1.9703,
"step": 731
},
{
"epoch": 0.22218849597814538,
"grad_norm": 0.36272746324539185,
"learning_rate": 9.634504404171307e-05,
"loss": 1.7773,
"step": 732
},
{
"epoch": 0.22249203217483685,
"grad_norm": 0.4233802556991577,
"learning_rate": 9.633998177584287e-05,
"loss": 2.0016,
"step": 733
},
{
"epoch": 0.2227955683715283,
"grad_norm": 0.46138089895248413,
"learning_rate": 9.633491950997267e-05,
"loss": 1.764,
"step": 734
},
{
"epoch": 0.22309910456821977,
"grad_norm": 0.37863031029701233,
"learning_rate": 9.632985724410246e-05,
"loss": 1.6493,
"step": 735
},
{
"epoch": 0.2234026407649112,
"grad_norm": 0.4493837356567383,
"learning_rate": 9.632479497823226e-05,
"loss": 2.04,
"step": 736
},
{
"epoch": 0.22370617696160267,
"grad_norm": 0.581119179725647,
"learning_rate": 9.631973271236205e-05,
"loss": 1.777,
"step": 737
},
{
"epoch": 0.22400971315829413,
"grad_norm": 0.3730584979057312,
"learning_rate": 9.631467044649185e-05,
"loss": 1.8932,
"step": 738
},
{
"epoch": 0.2243132493549856,
"grad_norm": 0.351421594619751,
"learning_rate": 9.630960818062164e-05,
"loss": 2.3182,
"step": 739
},
{
"epoch": 0.22461678555167705,
"grad_norm": 0.4237976670265198,
"learning_rate": 9.630454591475144e-05,
"loss": 2.1315,
"step": 740
},
{
"epoch": 0.22492032174836848,
"grad_norm": 0.38544562458992004,
"learning_rate": 9.629948364888123e-05,
"loss": 1.9596,
"step": 741
},
{
"epoch": 0.22522385794505995,
"grad_norm": 0.407672256231308,
"learning_rate": 9.629442138301104e-05,
"loss": 1.8694,
"step": 742
},
{
"epoch": 0.2255273941417514,
"grad_norm": 0.4415782690048218,
"learning_rate": 9.628935911714084e-05,
"loss": 1.8658,
"step": 743
},
{
"epoch": 0.22583093033844287,
"grad_norm": 0.41300657391548157,
"learning_rate": 9.628429685127063e-05,
"loss": 2.0477,
"step": 744
},
{
"epoch": 0.2261344665351343,
"grad_norm": 0.36000654101371765,
"learning_rate": 9.627923458540044e-05,
"loss": 1.9045,
"step": 745
},
{
"epoch": 0.22643800273182577,
"grad_norm": 0.42653003334999084,
"learning_rate": 9.627417231953023e-05,
"loss": 1.2151,
"step": 746
},
{
"epoch": 0.22674153892851723,
"grad_norm": 0.4157649874687195,
"learning_rate": 9.626911005366003e-05,
"loss": 1.9335,
"step": 747
},
{
"epoch": 0.2270450751252087,
"grad_norm": 0.3805077373981476,
"learning_rate": 9.626404778778982e-05,
"loss": 2.0803,
"step": 748
},
{
"epoch": 0.22734861132190012,
"grad_norm": 0.39710867404937744,
"learning_rate": 9.625898552191962e-05,
"loss": 2.2628,
"step": 749
},
{
"epoch": 0.22765214751859159,
"grad_norm": 0.4012609124183655,
"learning_rate": 9.625392325604941e-05,
"loss": 1.9586,
"step": 750
},
{
"epoch": 0.22795568371528305,
"grad_norm": 0.9281008243560791,
"learning_rate": 9.624886099017921e-05,
"loss": 1.168,
"step": 751
},
{
"epoch": 0.2282592199119745,
"grad_norm": 0.36847764253616333,
"learning_rate": 9.6243798724309e-05,
"loss": 1.8907,
"step": 752
},
{
"epoch": 0.22856275610866597,
"grad_norm": 0.4531751573085785,
"learning_rate": 9.623873645843881e-05,
"loss": 1.4511,
"step": 753
},
{
"epoch": 0.2288662923053574,
"grad_norm": 0.36623820662498474,
"learning_rate": 9.62336741925686e-05,
"loss": 1.6707,
"step": 754
},
{
"epoch": 0.22916982850204887,
"grad_norm": 0.3104342222213745,
"learning_rate": 9.62286119266984e-05,
"loss": 1.988,
"step": 755
},
{
"epoch": 0.22947336469874033,
"grad_norm": 0.3790084421634674,
"learning_rate": 9.62235496608282e-05,
"loss": 1.979,
"step": 756
},
{
"epoch": 0.2297769008954318,
"grad_norm": 0.3642970323562622,
"learning_rate": 9.621848739495799e-05,
"loss": 1.9998,
"step": 757
},
{
"epoch": 0.23008043709212322,
"grad_norm": 0.34588292241096497,
"learning_rate": 9.621342512908779e-05,
"loss": 2.0511,
"step": 758
},
{
"epoch": 0.2303839732888147,
"grad_norm": 0.3556496798992157,
"learning_rate": 9.620836286321758e-05,
"loss": 1.8785,
"step": 759
},
{
"epoch": 0.23068750948550615,
"grad_norm": 0.4669034779071808,
"learning_rate": 9.620330059734737e-05,
"loss": 1.5027,
"step": 760
},
{
"epoch": 0.2309910456821976,
"grad_norm": 0.39685994386672974,
"learning_rate": 9.619823833147717e-05,
"loss": 2.1644,
"step": 761
},
{
"epoch": 0.23129458187888904,
"grad_norm": 0.39183005690574646,
"learning_rate": 9.619317606560698e-05,
"loss": 1.9615,
"step": 762
},
{
"epoch": 0.2315981180755805,
"grad_norm": 0.36401331424713135,
"learning_rate": 9.618811379973677e-05,
"loss": 1.7535,
"step": 763
},
{
"epoch": 0.23190165427227197,
"grad_norm": 0.43118295073509216,
"learning_rate": 9.618305153386657e-05,
"loss": 1.884,
"step": 764
},
{
"epoch": 0.23220519046896343,
"grad_norm": 0.5061665177345276,
"learning_rate": 9.617798926799636e-05,
"loss": 2.0051,
"step": 765
},
{
"epoch": 0.2325087266656549,
"grad_norm": 0.4487472474575043,
"learning_rate": 9.617292700212616e-05,
"loss": 1.6831,
"step": 766
},
{
"epoch": 0.23281226286234633,
"grad_norm": 0.3660997450351715,
"learning_rate": 9.616786473625595e-05,
"loss": 1.9276,
"step": 767
},
{
"epoch": 0.2331157990590378,
"grad_norm": 0.3823026716709137,
"learning_rate": 9.616280247038575e-05,
"loss": 1.9817,
"step": 768
},
{
"epoch": 0.23341933525572925,
"grad_norm": 0.32568395137786865,
"learning_rate": 9.615774020451554e-05,
"loss": 1.508,
"step": 769
},
{
"epoch": 0.2337228714524207,
"grad_norm": 0.34985265135765076,
"learning_rate": 9.615267793864534e-05,
"loss": 1.6793,
"step": 770
},
{
"epoch": 0.23402640764911214,
"grad_norm": 0.38563957810401917,
"learning_rate": 9.614761567277513e-05,
"loss": 1.588,
"step": 771
},
{
"epoch": 0.2343299438458036,
"grad_norm": 0.33572301268577576,
"learning_rate": 9.614255340690494e-05,
"loss": 1.9541,
"step": 772
},
{
"epoch": 0.23463348004249507,
"grad_norm": 0.33936449885368347,
"learning_rate": 9.613749114103473e-05,
"loss": 1.9311,
"step": 773
},
{
"epoch": 0.23493701623918653,
"grad_norm": 0.34984657168388367,
"learning_rate": 9.613242887516453e-05,
"loss": 1.9532,
"step": 774
},
{
"epoch": 0.235240552435878,
"grad_norm": 0.3651373088359833,
"learning_rate": 9.612736660929432e-05,
"loss": 1.8815,
"step": 775
},
{
"epoch": 0.23554408863256943,
"grad_norm": 0.4317852854728699,
"learning_rate": 9.612230434342412e-05,
"loss": 2.0262,
"step": 776
},
{
"epoch": 0.2358476248292609,
"grad_norm": 0.375522255897522,
"learning_rate": 9.611724207755391e-05,
"loss": 1.9964,
"step": 777
},
{
"epoch": 0.23615116102595235,
"grad_norm": 0.37290844321250916,
"learning_rate": 9.611217981168371e-05,
"loss": 1.7456,
"step": 778
},
{
"epoch": 0.2364546972226438,
"grad_norm": 0.3768545985221863,
"learning_rate": 9.61071175458135e-05,
"loss": 1.9591,
"step": 779
},
{
"epoch": 0.23675823341933525,
"grad_norm": 0.3147246837615967,
"learning_rate": 9.61020552799433e-05,
"loss": 1.4033,
"step": 780
},
{
"epoch": 0.2370617696160267,
"grad_norm": 0.4480874240398407,
"learning_rate": 9.60969930140731e-05,
"loss": 1.9598,
"step": 781
},
{
"epoch": 0.23736530581271817,
"grad_norm": 0.7287562489509583,
"learning_rate": 9.60919307482029e-05,
"loss": 2.0097,
"step": 782
},
{
"epoch": 0.23766884200940963,
"grad_norm": 0.36199334263801575,
"learning_rate": 9.60868684823327e-05,
"loss": 1.8089,
"step": 783
},
{
"epoch": 0.23797237820610107,
"grad_norm": 0.32855263352394104,
"learning_rate": 9.608180621646249e-05,
"loss": 2.0199,
"step": 784
},
{
"epoch": 0.23827591440279253,
"grad_norm": 0.37182894349098206,
"learning_rate": 9.607674395059229e-05,
"loss": 1.7253,
"step": 785
},
{
"epoch": 0.238579450599484,
"grad_norm": 0.3365595042705536,
"learning_rate": 9.607168168472208e-05,
"loss": 1.9308,
"step": 786
},
{
"epoch": 0.23888298679617545,
"grad_norm": 0.400685578584671,
"learning_rate": 9.606661941885187e-05,
"loss": 1.8939,
"step": 787
},
{
"epoch": 0.2391865229928669,
"grad_norm": 0.6354159116744995,
"learning_rate": 9.606155715298167e-05,
"loss": 2.1476,
"step": 788
},
{
"epoch": 0.23949005918955835,
"grad_norm": 0.4196738600730896,
"learning_rate": 9.605649488711148e-05,
"loss": 1.8457,
"step": 789
},
{
"epoch": 0.2397935953862498,
"grad_norm": 0.35839545726776123,
"learning_rate": 9.605143262124127e-05,
"loss": 1.824,
"step": 790
},
{
"epoch": 0.24009713158294127,
"grad_norm": 0.3597940504550934,
"learning_rate": 9.604637035537107e-05,
"loss": 1.9583,
"step": 791
},
{
"epoch": 0.24040066777963273,
"grad_norm": 0.5783160924911499,
"learning_rate": 9.604130808950088e-05,
"loss": 2.2,
"step": 792
},
{
"epoch": 0.24070420397632417,
"grad_norm": 0.3544808030128479,
"learning_rate": 9.603624582363067e-05,
"loss": 2.1092,
"step": 793
},
{
"epoch": 0.24100774017301563,
"grad_norm": 0.41170623898506165,
"learning_rate": 9.603118355776047e-05,
"loss": 1.6004,
"step": 794
},
{
"epoch": 0.2413112763697071,
"grad_norm": 0.3832992613315582,
"learning_rate": 9.602612129189026e-05,
"loss": 1.4981,
"step": 795
},
{
"epoch": 0.24161481256639855,
"grad_norm": 0.5239993333816528,
"learning_rate": 9.602105902602006e-05,
"loss": 1.6026,
"step": 796
},
{
"epoch": 0.24191834876308999,
"grad_norm": 0.38445138931274414,
"learning_rate": 9.601599676014985e-05,
"loss": 1.5765,
"step": 797
},
{
"epoch": 0.24222188495978145,
"grad_norm": 0.38520511984825134,
"learning_rate": 9.601093449427964e-05,
"loss": 2.1069,
"step": 798
},
{
"epoch": 0.2425254211564729,
"grad_norm": 0.3519560694694519,
"learning_rate": 9.600587222840944e-05,
"loss": 1.8896,
"step": 799
},
{
"epoch": 0.24282895735316437,
"grad_norm": 0.5392457246780396,
"learning_rate": 9.600080996253923e-05,
"loss": 1.6273,
"step": 800
},
{
"epoch": 0.24313249354985583,
"grad_norm": 0.4213111996650696,
"learning_rate": 9.599574769666904e-05,
"loss": 1.489,
"step": 801
},
{
"epoch": 0.24343602974654727,
"grad_norm": 0.4006531834602356,
"learning_rate": 9.599068543079884e-05,
"loss": 1.9842,
"step": 802
},
{
"epoch": 0.24373956594323873,
"grad_norm": 0.3792324364185333,
"learning_rate": 9.598562316492863e-05,
"loss": 1.727,
"step": 803
},
{
"epoch": 0.2440431021399302,
"grad_norm": 0.3555270731449127,
"learning_rate": 9.598056089905843e-05,
"loss": 1.68,
"step": 804
},
{
"epoch": 0.24434663833662165,
"grad_norm": 0.33837342262268066,
"learning_rate": 9.597549863318822e-05,
"loss": 2.0709,
"step": 805
},
{
"epoch": 0.2446501745333131,
"grad_norm": 0.3812510371208191,
"learning_rate": 9.597043636731802e-05,
"loss": 2.1211,
"step": 806
},
{
"epoch": 0.24495371073000455,
"grad_norm": 0.33870792388916016,
"learning_rate": 9.596537410144781e-05,
"loss": 2.1047,
"step": 807
},
{
"epoch": 0.245257246926696,
"grad_norm": 0.3948252201080322,
"learning_rate": 9.59603118355776e-05,
"loss": 1.7553,
"step": 808
},
{
"epoch": 0.24556078312338747,
"grad_norm": 0.39410725235939026,
"learning_rate": 9.59552495697074e-05,
"loss": 1.9383,
"step": 809
},
{
"epoch": 0.2458643193200789,
"grad_norm": 0.37794989347457886,
"learning_rate": 9.59501873038372e-05,
"loss": 1.9115,
"step": 810
},
{
"epoch": 0.24616785551677037,
"grad_norm": 1.6270610094070435,
"learning_rate": 9.5945125037967e-05,
"loss": 1.8472,
"step": 811
},
{
"epoch": 0.24647139171346183,
"grad_norm": 0.3724587559700012,
"learning_rate": 9.59400627720968e-05,
"loss": 1.9087,
"step": 812
},
{
"epoch": 0.2467749279101533,
"grad_norm": 0.4097403585910797,
"learning_rate": 9.59350005062266e-05,
"loss": 1.8325,
"step": 813
},
{
"epoch": 0.24707846410684475,
"grad_norm": 0.4052940905094147,
"learning_rate": 9.592993824035639e-05,
"loss": 2.0241,
"step": 814
},
{
"epoch": 0.2473820003035362,
"grad_norm": 0.3887682557106018,
"learning_rate": 9.592487597448618e-05,
"loss": 1.6114,
"step": 815
},
{
"epoch": 0.24768553650022765,
"grad_norm": 0.404450386762619,
"learning_rate": 9.591981370861598e-05,
"loss": 1.8384,
"step": 816
},
{
"epoch": 0.2479890726969191,
"grad_norm": 0.7955893874168396,
"learning_rate": 9.591475144274577e-05,
"loss": 2.2149,
"step": 817
},
{
"epoch": 0.24829260889361057,
"grad_norm": 4.355859279632568,
"learning_rate": 9.590968917687557e-05,
"loss": 2.3753,
"step": 818
},
{
"epoch": 0.248596145090302,
"grad_norm": 0.3698444962501526,
"learning_rate": 9.590462691100536e-05,
"loss": 1.7354,
"step": 819
},
{
"epoch": 0.24889968128699347,
"grad_norm": 0.3658899962902069,
"learning_rate": 9.589956464513517e-05,
"loss": 1.7803,
"step": 820
},
{
"epoch": 0.24920321748368493,
"grad_norm": 0.405072957277298,
"learning_rate": 9.589450237926497e-05,
"loss": 1.7684,
"step": 821
},
{
"epoch": 0.2495067536803764,
"grad_norm": 0.7590973973274231,
"learning_rate": 9.588944011339476e-05,
"loss": 1.9466,
"step": 822
},
{
"epoch": 0.24981028987706785,
"grad_norm": 0.5217581987380981,
"learning_rate": 9.588437784752456e-05,
"loss": 2.1281,
"step": 823
},
{
"epoch": 0.2501138260737593,
"grad_norm": 0.3716435134410858,
"learning_rate": 9.587931558165435e-05,
"loss": 2.114,
"step": 824
},
{
"epoch": 0.25041736227045075,
"grad_norm": 0.44017624855041504,
"learning_rate": 9.587425331578414e-05,
"loss": 2.0445,
"step": 825
},
{
"epoch": 0.2507208984671422,
"grad_norm": 0.370370090007782,
"learning_rate": 9.586919104991394e-05,
"loss": 1.8674,
"step": 826
},
{
"epoch": 0.2510244346638337,
"grad_norm": 0.32125499844551086,
"learning_rate": 9.586412878404373e-05,
"loss": 1.4129,
"step": 827
},
{
"epoch": 0.2513279708605251,
"grad_norm": 0.4143073856830597,
"learning_rate": 9.585906651817353e-05,
"loss": 1.9895,
"step": 828
},
{
"epoch": 0.2516315070572166,
"grad_norm": 0.3492576777935028,
"learning_rate": 9.585400425230334e-05,
"loss": 2.0669,
"step": 829
},
{
"epoch": 0.25193504325390803,
"grad_norm": 0.4044751524925232,
"learning_rate": 9.584894198643313e-05,
"loss": 1.5909,
"step": 830
},
{
"epoch": 0.25223857945059946,
"grad_norm": 0.3410158157348633,
"learning_rate": 9.584387972056293e-05,
"loss": 1.7485,
"step": 831
},
{
"epoch": 0.25254211564729095,
"grad_norm": 0.340320348739624,
"learning_rate": 9.583881745469272e-05,
"loss": 1.8897,
"step": 832
},
{
"epoch": 0.2528456518439824,
"grad_norm": 0.35516276955604553,
"learning_rate": 9.583375518882252e-05,
"loss": 1.6332,
"step": 833
},
{
"epoch": 0.2531491880406738,
"grad_norm": 0.4099842309951782,
"learning_rate": 9.582869292295232e-05,
"loss": 1.5617,
"step": 834
},
{
"epoch": 0.2534527242373653,
"grad_norm": 0.38086098432540894,
"learning_rate": 9.582363065708212e-05,
"loss": 2.0837,
"step": 835
},
{
"epoch": 0.25375626043405675,
"grad_norm": 0.8040663003921509,
"learning_rate": 9.581856839121191e-05,
"loss": 1.8587,
"step": 836
},
{
"epoch": 0.25405979663074824,
"grad_norm": 0.41297683119773865,
"learning_rate": 9.581350612534171e-05,
"loss": 1.9602,
"step": 837
},
{
"epoch": 0.25436333282743967,
"grad_norm": 0.38155442476272583,
"learning_rate": 9.58084438594715e-05,
"loss": 1.375,
"step": 838
},
{
"epoch": 0.2546668690241311,
"grad_norm": 0.3956829905509949,
"learning_rate": 9.58033815936013e-05,
"loss": 1.9617,
"step": 839
},
{
"epoch": 0.2549704052208226,
"grad_norm": 0.38675928115844727,
"learning_rate": 9.579831932773111e-05,
"loss": 1.8186,
"step": 840
},
{
"epoch": 0.255273941417514,
"grad_norm": 0.33989018201828003,
"learning_rate": 9.57932570618609e-05,
"loss": 2.1734,
"step": 841
},
{
"epoch": 0.2555774776142055,
"grad_norm": 0.3240448534488678,
"learning_rate": 9.57881947959907e-05,
"loss": 1.6238,
"step": 842
},
{
"epoch": 0.25588101381089695,
"grad_norm": 0.6117075681686401,
"learning_rate": 9.578313253012049e-05,
"loss": 1.986,
"step": 843
},
{
"epoch": 0.2561845500075884,
"grad_norm": 0.3781290650367737,
"learning_rate": 9.577807026425029e-05,
"loss": 2.0021,
"step": 844
},
{
"epoch": 0.2564880862042799,
"grad_norm": 0.4373374879360199,
"learning_rate": 9.577300799838008e-05,
"loss": 2.0195,
"step": 845
},
{
"epoch": 0.2567916224009713,
"grad_norm": 0.4125923216342926,
"learning_rate": 9.576794573250988e-05,
"loss": 1.9412,
"step": 846
},
{
"epoch": 0.2570951585976628,
"grad_norm": 0.3557007908821106,
"learning_rate": 9.576288346663967e-05,
"loss": 1.8098,
"step": 847
},
{
"epoch": 0.25739869479435423,
"grad_norm": 0.49475541710853577,
"learning_rate": 9.575782120076947e-05,
"loss": 1.5756,
"step": 848
},
{
"epoch": 0.25770223099104567,
"grad_norm": 0.3507518768310547,
"learning_rate": 9.575275893489926e-05,
"loss": 1.6413,
"step": 849
},
{
"epoch": 0.25800576718773716,
"grad_norm": 0.39508333802223206,
"learning_rate": 9.574769666902907e-05,
"loss": 1.9777,
"step": 850
},
{
"epoch": 0.2583093033844286,
"grad_norm": 0.328807532787323,
"learning_rate": 9.574263440315886e-05,
"loss": 1.4948,
"step": 851
},
{
"epoch": 0.25861283958112,
"grad_norm": 0.3154551386833191,
"learning_rate": 9.573757213728866e-05,
"loss": 1.7809,
"step": 852
},
{
"epoch": 0.2589163757778115,
"grad_norm": 0.502554178237915,
"learning_rate": 9.573250987141845e-05,
"loss": 1.4369,
"step": 853
},
{
"epoch": 0.25921991197450295,
"grad_norm": 0.4416670799255371,
"learning_rate": 9.572744760554825e-05,
"loss": 1.7364,
"step": 854
},
{
"epoch": 0.25952344817119444,
"grad_norm": 0.43228060007095337,
"learning_rate": 9.572238533967804e-05,
"loss": 1.3281,
"step": 855
},
{
"epoch": 0.25982698436788587,
"grad_norm": 0.3714723289012909,
"learning_rate": 9.571732307380784e-05,
"loss": 2.0893,
"step": 856
},
{
"epoch": 0.2601305205645773,
"grad_norm": 0.3309679925441742,
"learning_rate": 9.571226080793763e-05,
"loss": 1.7982,
"step": 857
},
{
"epoch": 0.2604340567612688,
"grad_norm": 0.3709767460823059,
"learning_rate": 9.570719854206743e-05,
"loss": 1.8628,
"step": 858
},
{
"epoch": 0.26073759295796023,
"grad_norm": 0.6020816564559937,
"learning_rate": 9.570213627619724e-05,
"loss": 2.0077,
"step": 859
},
{
"epoch": 0.2610411291546517,
"grad_norm": 0.30620431900024414,
"learning_rate": 9.569707401032703e-05,
"loss": 1.8834,
"step": 860
},
{
"epoch": 0.26134466535134315,
"grad_norm": 0.41518962383270264,
"learning_rate": 9.569201174445683e-05,
"loss": 1.8025,
"step": 861
},
{
"epoch": 0.2616482015480346,
"grad_norm": 0.3919786512851715,
"learning_rate": 9.568694947858662e-05,
"loss": 1.995,
"step": 862
},
{
"epoch": 0.2619517377447261,
"grad_norm": 0.47429168224334717,
"learning_rate": 9.568188721271641e-05,
"loss": 1.9423,
"step": 863
},
{
"epoch": 0.2622552739414175,
"grad_norm": 0.8941421508789062,
"learning_rate": 9.567682494684621e-05,
"loss": 1.5046,
"step": 864
},
{
"epoch": 0.26255881013810894,
"grad_norm": 0.4357859194278717,
"learning_rate": 9.5671762680976e-05,
"loss": 2.0023,
"step": 865
},
{
"epoch": 0.26286234633480043,
"grad_norm": 0.3873944878578186,
"learning_rate": 9.56667004151058e-05,
"loss": 2.0607,
"step": 866
},
{
"epoch": 0.26316588253149187,
"grad_norm": 0.4355853497982025,
"learning_rate": 9.56616381492356e-05,
"loss": 1.8254,
"step": 867
},
{
"epoch": 0.26346941872818336,
"grad_norm": 0.3882213234901428,
"learning_rate": 9.56565758833654e-05,
"loss": 1.7809,
"step": 868
},
{
"epoch": 0.2637729549248748,
"grad_norm": 0.4021656811237335,
"learning_rate": 9.56515136174952e-05,
"loss": 2.1321,
"step": 869
},
{
"epoch": 0.2640764911215662,
"grad_norm": 0.43587526679039,
"learning_rate": 9.564645135162499e-05,
"loss": 1.7865,
"step": 870
},
{
"epoch": 0.2643800273182577,
"grad_norm": 0.364045649766922,
"learning_rate": 9.564138908575479e-05,
"loss": 1.8173,
"step": 871
},
{
"epoch": 0.26468356351494915,
"grad_norm": 0.3956625461578369,
"learning_rate": 9.563632681988458e-05,
"loss": 1.4822,
"step": 872
},
{
"epoch": 0.26498709971164064,
"grad_norm": 0.40755051374435425,
"learning_rate": 9.563126455401438e-05,
"loss": 1.9418,
"step": 873
},
{
"epoch": 0.2652906359083321,
"grad_norm": 0.39405086636543274,
"learning_rate": 9.562620228814417e-05,
"loss": 1.4529,
"step": 874
},
{
"epoch": 0.2655941721050235,
"grad_norm": 0.4400351047515869,
"learning_rate": 9.562114002227397e-05,
"loss": 2.1095,
"step": 875
},
{
"epoch": 0.265897708301715,
"grad_norm": 0.40135496854782104,
"learning_rate": 9.561607775640376e-05,
"loss": 1.9462,
"step": 876
},
{
"epoch": 0.26620124449840643,
"grad_norm": 0.5949604511260986,
"learning_rate": 9.561101549053356e-05,
"loss": 1.8797,
"step": 877
},
{
"epoch": 0.26650478069509786,
"grad_norm": 0.38301005959510803,
"learning_rate": 9.560595322466336e-05,
"loss": 2.0887,
"step": 878
},
{
"epoch": 0.26680831689178935,
"grad_norm": 0.6215627789497375,
"learning_rate": 9.560089095879317e-05,
"loss": 1.7846,
"step": 879
},
{
"epoch": 0.2671118530884808,
"grad_norm": 0.4041058123111725,
"learning_rate": 9.559582869292297e-05,
"loss": 1.5127,
"step": 880
},
{
"epoch": 0.2674153892851723,
"grad_norm": 0.30281975865364075,
"learning_rate": 9.559076642705276e-05,
"loss": 1.8487,
"step": 881
},
{
"epoch": 0.2677189254818637,
"grad_norm": 0.34536200761795044,
"learning_rate": 9.558570416118256e-05,
"loss": 1.8976,
"step": 882
},
{
"epoch": 0.26802246167855515,
"grad_norm": 0.367245614528656,
"learning_rate": 9.558064189531235e-05,
"loss": 1.9804,
"step": 883
},
{
"epoch": 0.26832599787524664,
"grad_norm": 0.41750359535217285,
"learning_rate": 9.557557962944215e-05,
"loss": 1.5932,
"step": 884
},
{
"epoch": 0.26862953407193807,
"grad_norm": 0.7777047157287598,
"learning_rate": 9.557051736357194e-05,
"loss": 1.8513,
"step": 885
},
{
"epoch": 0.26893307026862956,
"grad_norm": 0.3720252215862274,
"learning_rate": 9.556545509770174e-05,
"loss": 2.1819,
"step": 886
},
{
"epoch": 0.269236606465321,
"grad_norm": 0.7321712970733643,
"learning_rate": 9.556039283183153e-05,
"loss": 1.4653,
"step": 887
},
{
"epoch": 0.2695401426620124,
"grad_norm": 0.4140429198741913,
"learning_rate": 9.555533056596133e-05,
"loss": 1.9816,
"step": 888
},
{
"epoch": 0.2698436788587039,
"grad_norm": 0.40684935450553894,
"learning_rate": 9.555026830009113e-05,
"loss": 1.5866,
"step": 889
},
{
"epoch": 0.27014721505539535,
"grad_norm": 0.4067225754261017,
"learning_rate": 9.554520603422093e-05,
"loss": 1.5951,
"step": 890
},
{
"epoch": 0.2704507512520868,
"grad_norm": 0.34240391850471497,
"learning_rate": 9.554014376835072e-05,
"loss": 1.9076,
"step": 891
},
{
"epoch": 0.2707542874487783,
"grad_norm": 0.4634522795677185,
"learning_rate": 9.553508150248052e-05,
"loss": 1.9856,
"step": 892
},
{
"epoch": 0.2710578236454697,
"grad_norm": 0.408015638589859,
"learning_rate": 9.553001923661031e-05,
"loss": 1.7997,
"step": 893
},
{
"epoch": 0.2713613598421612,
"grad_norm": 0.3894648253917694,
"learning_rate": 9.552495697074011e-05,
"loss": 1.8381,
"step": 894
},
{
"epoch": 0.27166489603885263,
"grad_norm": 0.37494730949401855,
"learning_rate": 9.55198947048699e-05,
"loss": 2.0548,
"step": 895
},
{
"epoch": 0.27196843223554407,
"grad_norm": 0.39796411991119385,
"learning_rate": 9.55148324389997e-05,
"loss": 1.9272,
"step": 896
},
{
"epoch": 0.27227196843223556,
"grad_norm": 0.40153494477272034,
"learning_rate": 9.550977017312949e-05,
"loss": 1.7136,
"step": 897
},
{
"epoch": 0.272575504628927,
"grad_norm": 0.39771386981010437,
"learning_rate": 9.55047079072593e-05,
"loss": 2.1017,
"step": 898
},
{
"epoch": 0.2728790408256185,
"grad_norm": 0.4085974097251892,
"learning_rate": 9.54996456413891e-05,
"loss": 1.3951,
"step": 899
},
{
"epoch": 0.2731825770223099,
"grad_norm": 0.39849239587783813,
"learning_rate": 9.549458337551889e-05,
"loss": 1.9988,
"step": 900
},
{
"epoch": 0.27348611321900135,
"grad_norm": 0.38662001490592957,
"learning_rate": 9.548952110964868e-05,
"loss": 1.8491,
"step": 901
},
{
"epoch": 0.27378964941569284,
"grad_norm": 0.38078710436820984,
"learning_rate": 9.548445884377848e-05,
"loss": 1.9,
"step": 902
},
{
"epoch": 0.27409318561238427,
"grad_norm": 0.3548724949359894,
"learning_rate": 9.547939657790827e-05,
"loss": 1.8754,
"step": 903
},
{
"epoch": 0.2743967218090757,
"grad_norm": 0.37712323665618896,
"learning_rate": 9.547433431203807e-05,
"loss": 1.5497,
"step": 904
},
{
"epoch": 0.2747002580057672,
"grad_norm": 0.4060449004173279,
"learning_rate": 9.546927204616786e-05,
"loss": 1.7231,
"step": 905
},
{
"epoch": 0.27500379420245863,
"grad_norm": 0.42080479860305786,
"learning_rate": 9.546420978029766e-05,
"loss": 2.1538,
"step": 906
},
{
"epoch": 0.2753073303991501,
"grad_norm": 0.4034046232700348,
"learning_rate": 9.545914751442747e-05,
"loss": 1.7335,
"step": 907
},
{
"epoch": 0.27561086659584155,
"grad_norm": 0.3676345646381378,
"learning_rate": 9.545408524855726e-05,
"loss": 1.6193,
"step": 908
},
{
"epoch": 0.275914402792533,
"grad_norm": 0.3349851965904236,
"learning_rate": 9.544902298268706e-05,
"loss": 1.8997,
"step": 909
},
{
"epoch": 0.2762179389892245,
"grad_norm": 0.3676302134990692,
"learning_rate": 9.544396071681685e-05,
"loss": 1.4031,
"step": 910
},
{
"epoch": 0.2765214751859159,
"grad_norm": 0.36593666672706604,
"learning_rate": 9.543889845094665e-05,
"loss": 1.8838,
"step": 911
},
{
"epoch": 0.2768250113826074,
"grad_norm": 0.3793712258338928,
"learning_rate": 9.543383618507644e-05,
"loss": 1.5949,
"step": 912
},
{
"epoch": 0.27712854757929883,
"grad_norm": 0.47586631774902344,
"learning_rate": 9.542877391920624e-05,
"loss": 1.5687,
"step": 913
},
{
"epoch": 0.27743208377599027,
"grad_norm": 0.38850024342536926,
"learning_rate": 9.542371165333603e-05,
"loss": 1.7336,
"step": 914
},
{
"epoch": 0.27773561997268176,
"grad_norm": 0.4039680063724518,
"learning_rate": 9.541864938746583e-05,
"loss": 2.0476,
"step": 915
},
{
"epoch": 0.2780391561693732,
"grad_norm": 0.40498992800712585,
"learning_rate": 9.541358712159562e-05,
"loss": 1.6699,
"step": 916
},
{
"epoch": 0.2783426923660646,
"grad_norm": 0.39011168479919434,
"learning_rate": 9.540852485572543e-05,
"loss": 1.9935,
"step": 917
},
{
"epoch": 0.2786462285627561,
"grad_norm": 0.3864549696445465,
"learning_rate": 9.540346258985522e-05,
"loss": 1.8271,
"step": 918
},
{
"epoch": 0.27894976475944755,
"grad_norm": 0.33493247628211975,
"learning_rate": 9.539840032398502e-05,
"loss": 1.856,
"step": 919
},
{
"epoch": 0.27925330095613904,
"grad_norm": 0.34132060408592224,
"learning_rate": 9.539333805811481e-05,
"loss": 1.8836,
"step": 920
},
{
"epoch": 0.2795568371528305,
"grad_norm": 1.5312176942825317,
"learning_rate": 9.538827579224461e-05,
"loss": 2.0207,
"step": 921
},
{
"epoch": 0.2798603733495219,
"grad_norm": 0.333932489156723,
"learning_rate": 9.53832135263744e-05,
"loss": 2.0908,
"step": 922
},
{
"epoch": 0.2801639095462134,
"grad_norm": 0.3688269555568695,
"learning_rate": 9.537815126050421e-05,
"loss": 1.8464,
"step": 923
},
{
"epoch": 0.28046744574290483,
"grad_norm": 0.4097294211387634,
"learning_rate": 9.5373088994634e-05,
"loss": 1.6891,
"step": 924
},
{
"epoch": 0.2807709819395963,
"grad_norm": 0.3737453818321228,
"learning_rate": 9.53680267287638e-05,
"loss": 2.0549,
"step": 925
},
{
"epoch": 0.28107451813628775,
"grad_norm": 0.6109428405761719,
"learning_rate": 9.53629644628936e-05,
"loss": 1.9437,
"step": 926
},
{
"epoch": 0.2813780543329792,
"grad_norm": 0.46215322613716125,
"learning_rate": 9.535790219702339e-05,
"loss": 1.5133,
"step": 927
},
{
"epoch": 0.2816815905296707,
"grad_norm": 0.8070108294487,
"learning_rate": 9.53528399311532e-05,
"loss": 1.8843,
"step": 928
},
{
"epoch": 0.2819851267263621,
"grad_norm": 0.40304142236709595,
"learning_rate": 9.534777766528299e-05,
"loss": 1.9742,
"step": 929
},
{
"epoch": 0.2822886629230536,
"grad_norm": 0.35046708583831787,
"learning_rate": 9.534271539941279e-05,
"loss": 1.8969,
"step": 930
},
{
"epoch": 0.28259219911974504,
"grad_norm": 0.37241777777671814,
"learning_rate": 9.533765313354258e-05,
"loss": 1.8138,
"step": 931
},
{
"epoch": 0.28289573531643647,
"grad_norm": 0.38689473271369934,
"learning_rate": 9.533259086767238e-05,
"loss": 1.669,
"step": 932
},
{
"epoch": 0.28319927151312796,
"grad_norm": 0.3672066926956177,
"learning_rate": 9.532752860180217e-05,
"loss": 1.9093,
"step": 933
},
{
"epoch": 0.2835028077098194,
"grad_norm": 0.4022217392921448,
"learning_rate": 9.532246633593197e-05,
"loss": 1.6959,
"step": 934
},
{
"epoch": 0.2838063439065108,
"grad_norm": 0.3894721269607544,
"learning_rate": 9.531740407006176e-05,
"loss": 1.9898,
"step": 935
},
{
"epoch": 0.2841098801032023,
"grad_norm": 0.4395015835762024,
"learning_rate": 9.531234180419156e-05,
"loss": 1.5538,
"step": 936
},
{
"epoch": 0.28441341629989375,
"grad_norm": 0.8121886849403381,
"learning_rate": 9.530727953832136e-05,
"loss": 1.7403,
"step": 937
},
{
"epoch": 0.28471695249658524,
"grad_norm": 0.40073227882385254,
"learning_rate": 9.530221727245116e-05,
"loss": 2.0544,
"step": 938
},
{
"epoch": 0.2850204886932767,
"grad_norm": 0.3571331202983856,
"learning_rate": 9.529715500658095e-05,
"loss": 1.7157,
"step": 939
},
{
"epoch": 0.2853240248899681,
"grad_norm": 0.485147625207901,
"learning_rate": 9.529209274071075e-05,
"loss": 2.1489,
"step": 940
},
{
"epoch": 0.2856275610866596,
"grad_norm": 0.6882160305976868,
"learning_rate": 9.528703047484054e-05,
"loss": 1.8458,
"step": 941
},
{
"epoch": 0.28593109728335103,
"grad_norm": 0.7156968116760254,
"learning_rate": 9.528196820897034e-05,
"loss": 1.9529,
"step": 942
},
{
"epoch": 0.2862346334800425,
"grad_norm": 0.4198112487792969,
"learning_rate": 9.527690594310013e-05,
"loss": 2.0355,
"step": 943
},
{
"epoch": 0.28653816967673396,
"grad_norm": 0.4178343117237091,
"learning_rate": 9.527184367722993e-05,
"loss": 1.5801,
"step": 944
},
{
"epoch": 0.2868417058734254,
"grad_norm": 0.3721866011619568,
"learning_rate": 9.526678141135972e-05,
"loss": 2.1657,
"step": 945
},
{
"epoch": 0.2871452420701169,
"grad_norm": 0.38586944341659546,
"learning_rate": 9.526171914548953e-05,
"loss": 1.4879,
"step": 946
},
{
"epoch": 0.2874487782668083,
"grad_norm": 0.42727598547935486,
"learning_rate": 9.525665687961933e-05,
"loss": 1.8434,
"step": 947
},
{
"epoch": 0.28775231446349975,
"grad_norm": 0.3686284124851227,
"learning_rate": 9.525159461374912e-05,
"loss": 1.9346,
"step": 948
},
{
"epoch": 0.28805585066019124,
"grad_norm": 0.41984260082244873,
"learning_rate": 9.524653234787892e-05,
"loss": 1.4474,
"step": 949
},
{
"epoch": 0.28835938685688267,
"grad_norm": 0.4530123174190521,
"learning_rate": 9.524147008200871e-05,
"loss": 1.6863,
"step": 950
},
{
"epoch": 0.28866292305357416,
"grad_norm": 0.40047594904899597,
"learning_rate": 9.52364078161385e-05,
"loss": 1.908,
"step": 951
},
{
"epoch": 0.2889664592502656,
"grad_norm": 0.3757762610912323,
"learning_rate": 9.52313455502683e-05,
"loss": 1.6235,
"step": 952
},
{
"epoch": 0.28926999544695703,
"grad_norm": 0.4337126612663269,
"learning_rate": 9.52262832843981e-05,
"loss": 1.6229,
"step": 953
},
{
"epoch": 0.2895735316436485,
"grad_norm": 0.4407886564731598,
"learning_rate": 9.522122101852789e-05,
"loss": 1.875,
"step": 954
},
{
"epoch": 0.28987706784033995,
"grad_norm": 0.5278657674789429,
"learning_rate": 9.521615875265768e-05,
"loss": 1.7199,
"step": 955
},
{
"epoch": 0.29018060403703144,
"grad_norm": 0.4441334307193756,
"learning_rate": 9.521109648678749e-05,
"loss": 1.1319,
"step": 956
},
{
"epoch": 0.2904841402337229,
"grad_norm": 0.3992663025856018,
"learning_rate": 9.520603422091729e-05,
"loss": 1.6948,
"step": 957
},
{
"epoch": 0.2907876764304143,
"grad_norm": 0.3979544937610626,
"learning_rate": 9.520097195504708e-05,
"loss": 1.8689,
"step": 958
},
{
"epoch": 0.2910912126271058,
"grad_norm": 0.4011298418045044,
"learning_rate": 9.519590968917688e-05,
"loss": 1.9491,
"step": 959
},
{
"epoch": 0.29139474882379723,
"grad_norm": 0.4377354383468628,
"learning_rate": 9.519084742330667e-05,
"loss": 1.7274,
"step": 960
},
{
"epoch": 0.29169828502048867,
"grad_norm": 0.5056617856025696,
"learning_rate": 9.518578515743647e-05,
"loss": 2.006,
"step": 961
},
{
"epoch": 0.29200182121718016,
"grad_norm": 0.36736002564430237,
"learning_rate": 9.518072289156626e-05,
"loss": 1.6558,
"step": 962
},
{
"epoch": 0.2923053574138716,
"grad_norm": 0.37966540455818176,
"learning_rate": 9.517566062569606e-05,
"loss": 2.0098,
"step": 963
},
{
"epoch": 0.2926088936105631,
"grad_norm": 0.4026505947113037,
"learning_rate": 9.517059835982585e-05,
"loss": 1.868,
"step": 964
},
{
"epoch": 0.2929124298072545,
"grad_norm": 0.461910218000412,
"learning_rate": 9.516553609395566e-05,
"loss": 2.1131,
"step": 965
},
{
"epoch": 0.29321596600394595,
"grad_norm": 0.4329175651073456,
"learning_rate": 9.516047382808545e-05,
"loss": 2.0068,
"step": 966
},
{
"epoch": 0.29351950220063744,
"grad_norm": 0.7611956000328064,
"learning_rate": 9.515541156221526e-05,
"loss": 1.9177,
"step": 967
},
{
"epoch": 0.2938230383973289,
"grad_norm": 0.6180218458175659,
"learning_rate": 9.515034929634506e-05,
"loss": 1.5603,
"step": 968
},
{
"epoch": 0.29412657459402036,
"grad_norm": 0.6556726694107056,
"learning_rate": 9.514528703047485e-05,
"loss": 2.1081,
"step": 969
},
{
"epoch": 0.2944301107907118,
"grad_norm": 0.3379404842853546,
"learning_rate": 9.514022476460465e-05,
"loss": 1.9701,
"step": 970
},
{
"epoch": 0.29473364698740323,
"grad_norm": 0.42676112055778503,
"learning_rate": 9.513516249873444e-05,
"loss": 1.6116,
"step": 971
},
{
"epoch": 0.2950371831840947,
"grad_norm": 0.35374894738197327,
"learning_rate": 9.513010023286424e-05,
"loss": 2.0621,
"step": 972
},
{
"epoch": 0.29534071938078615,
"grad_norm": 0.33012476563453674,
"learning_rate": 9.512503796699403e-05,
"loss": 1.4534,
"step": 973
},
{
"epoch": 0.2956442555774776,
"grad_norm": 0.37993383407592773,
"learning_rate": 9.511997570112383e-05,
"loss": 1.6306,
"step": 974
},
{
"epoch": 0.2959477917741691,
"grad_norm": 0.47140204906463623,
"learning_rate": 9.511491343525362e-05,
"loss": 2.0465,
"step": 975
},
{
"epoch": 0.2962513279708605,
"grad_norm": 0.40235936641693115,
"learning_rate": 9.510985116938343e-05,
"loss": 1.8247,
"step": 976
},
{
"epoch": 0.296554864167552,
"grad_norm": 0.3992665112018585,
"learning_rate": 9.510478890351322e-05,
"loss": 1.5702,
"step": 977
},
{
"epoch": 0.29685840036424344,
"grad_norm": 0.4469521641731262,
"learning_rate": 9.509972663764302e-05,
"loss": 1.8811,
"step": 978
},
{
"epoch": 0.29716193656093487,
"grad_norm": 0.41400644183158875,
"learning_rate": 9.509466437177281e-05,
"loss": 1.5374,
"step": 979
},
{
"epoch": 0.29746547275762636,
"grad_norm": 0.36348387598991394,
"learning_rate": 9.508960210590261e-05,
"loss": 1.9022,
"step": 980
},
{
"epoch": 0.2977690089543178,
"grad_norm": 0.4069242477416992,
"learning_rate": 9.50845398400324e-05,
"loss": 2.0066,
"step": 981
},
{
"epoch": 0.2980725451510093,
"grad_norm": 0.3684113323688507,
"learning_rate": 9.50794775741622e-05,
"loss": 1.8972,
"step": 982
},
{
"epoch": 0.2983760813477007,
"grad_norm": 0.40827688574790955,
"learning_rate": 9.5074415308292e-05,
"loss": 2.0659,
"step": 983
},
{
"epoch": 0.29867961754439215,
"grad_norm": 0.32065409421920776,
"learning_rate": 9.506935304242179e-05,
"loss": 2.0008,
"step": 984
},
{
"epoch": 0.29898315374108364,
"grad_norm": 0.38805294036865234,
"learning_rate": 9.50642907765516e-05,
"loss": 1.5027,
"step": 985
},
{
"epoch": 0.2992866899377751,
"grad_norm": 0.3656708896160126,
"learning_rate": 9.505922851068139e-05,
"loss": 1.7931,
"step": 986
},
{
"epoch": 0.2995902261344665,
"grad_norm": 0.4354289770126343,
"learning_rate": 9.505416624481119e-05,
"loss": 2.1183,
"step": 987
},
{
"epoch": 0.299893762331158,
"grad_norm": 0.3970641493797302,
"learning_rate": 9.504910397894098e-05,
"loss": 1.8188,
"step": 988
},
{
"epoch": 0.30019729852784943,
"grad_norm": 0.35527995228767395,
"learning_rate": 9.504404171307078e-05,
"loss": 1.6329,
"step": 989
},
{
"epoch": 0.3005008347245409,
"grad_norm": 0.4018630385398865,
"learning_rate": 9.503897944720057e-05,
"loss": 1.993,
"step": 990
},
{
"epoch": 0.30080437092123236,
"grad_norm": 0.36514052748680115,
"learning_rate": 9.503391718133037e-05,
"loss": 2.0482,
"step": 991
},
{
"epoch": 0.3011079071179238,
"grad_norm": 0.3790993094444275,
"learning_rate": 9.502885491546016e-05,
"loss": 2.0286,
"step": 992
},
{
"epoch": 0.3014114433146153,
"grad_norm": 0.314779669046402,
"learning_rate": 9.502379264958995e-05,
"loss": 1.8135,
"step": 993
},
{
"epoch": 0.3017149795113067,
"grad_norm": 0.42383378744125366,
"learning_rate": 9.501873038371975e-05,
"loss": 1.8783,
"step": 994
},
{
"epoch": 0.3020185157079982,
"grad_norm": 0.4036683738231659,
"learning_rate": 9.501366811784956e-05,
"loss": 1.6091,
"step": 995
},
{
"epoch": 0.30232205190468964,
"grad_norm": 0.3611324429512024,
"learning_rate": 9.500860585197935e-05,
"loss": 1.3388,
"step": 996
},
{
"epoch": 0.30262558810138107,
"grad_norm": 0.44210389256477356,
"learning_rate": 9.500354358610915e-05,
"loss": 1.6133,
"step": 997
},
{
"epoch": 0.30292912429807256,
"grad_norm": 0.37780526280403137,
"learning_rate": 9.499848132023894e-05,
"loss": 1.9993,
"step": 998
},
{
"epoch": 0.303232660494764,
"grad_norm": 0.469959557056427,
"learning_rate": 9.499341905436874e-05,
"loss": 1.8094,
"step": 999
},
{
"epoch": 0.30353619669145543,
"grad_norm": 0.38992664217948914,
"learning_rate": 9.498835678849853e-05,
"loss": 1.8975,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 19764,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.448897543149158e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}