Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

config.json +27 -0
generation_config.json +7 -0
model.safetensors +3 -0
special_tokens_map.json +51 -0
tokenizer.json +0 -0
tokenizer_config.json +70 -0
trainer_state.json +1572 -0

config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_name_or_path": "/nlsasfs/home/aipsc/myksingh/SIDDHESH/dpp/dpp/hindi_model/",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 7168,
+  "max_position_embeddings": 131072,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 512,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.1",
+  "use_cache": true,
+  "vocab_size": 30000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.41.1"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8a70f0d9e0f7f0e737284e8d60c3c64dc5e81280d2bd74d15c0d10ce0220ca1
+size 3981468040

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<cls>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "<sep>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<cls>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<bos>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "<sep>",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1572 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.47914153807761095,
+  "eval_steps": 3000,
+  "global_step": 10800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.002218247861470421,
+      "grad_norm": 0.15810145437717438,
+      "learning_rate": 2e-05,
+      "loss": 2.8627,
+      "step": 50
+    },
+    {
+      "epoch": 0.004436495722940842,
+      "grad_norm": 0.1590433567762375,
+      "learning_rate": 4e-05,
+      "loss": 2.8607,
+      "step": 100
+    },
+    {
+      "epoch": 0.006654743584411263,
+      "grad_norm": 0.15798641741275787,
+      "learning_rate": 6e-05,
+      "loss": 2.8623,
+      "step": 150
+    },
+    {
+      "epoch": 0.008872991445881684,
+      "grad_norm": 0.16127805411815643,
+      "learning_rate": 8e-05,
+      "loss": 2.8608,
+      "step": 200
+    },
+    {
+      "epoch": 0.011091239307352105,
+      "grad_norm": 0.1587396264076233,
+      "learning_rate": 0.0001,
+      "loss": 2.8608,
+      "step": 250
+    },
+    {
+      "epoch": 0.013309487168822525,
+      "grad_norm": 0.160736083984375,
+      "learning_rate": 0.00012,
+      "loss": 2.8563,
+      "step": 300
+    },
+    {
+      "epoch": 0.015527735030292948,
+      "grad_norm": 0.16256989538669586,
+      "learning_rate": 0.00014,
+      "loss": 2.8549,
+      "step": 350
+    },
+    {
+      "epoch": 0.01774598289176337,
+      "grad_norm": 0.16194568574428558,
+      "learning_rate": 0.00016,
+      "loss": 2.8557,
+      "step": 400
+    },
+    {
+      "epoch": 0.01996423075323379,
+      "grad_norm": 0.15836463868618011,
+      "learning_rate": 0.00018,
+      "loss": 2.8545,
+      "step": 450
+    },
+    {
+      "epoch": 0.02218247861470421,
+      "grad_norm": 0.16059577465057373,
+      "learning_rate": 0.0002,
+      "loss": 2.8522,
+      "step": 500
+    },
+    {
+      "epoch": 0.024400726476174632,
+      "grad_norm": 0.16031378507614136,
+      "learning_rate": 0.00022000000000000003,
+      "loss": 2.8481,
+      "step": 550
+    },
+    {
+      "epoch": 0.02661897433764505,
+      "grad_norm": 0.16000501811504364,
+      "learning_rate": 0.00024,
+      "loss": 2.8431,
+      "step": 600
+    },
+    {
+      "epoch": 0.028837222199115473,
+      "grad_norm": 0.15952646732330322,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 2.8475,
+      "step": 650
+    },
+    {
+      "epoch": 0.031055470060585896,
+      "grad_norm": 0.16443726420402527,
+      "learning_rate": 0.00028,
+      "loss": 2.8452,
+      "step": 700
+    },
+    {
+      "epoch": 0.033273717922056315,
+      "grad_norm": 0.1644088476896286,
+      "learning_rate": 0.00030000000000000003,
+      "loss": 2.8458,
+      "step": 750
+    },
+    {
+      "epoch": 0.03549196578352674,
+      "grad_norm": 0.16272033751010895,
+      "learning_rate": 0.00032,
+      "loss": 2.8435,
+      "step": 800
+    },
+    {
+      "epoch": 0.03771021364499716,
+      "grad_norm": 0.16485804319381714,
+      "learning_rate": 0.00034,
+      "loss": 2.8481,
+      "step": 850
+    },
+    {
+      "epoch": 0.03992846150646758,
+      "grad_norm": 0.1669188290834427,
+      "learning_rate": 0.00036,
+      "loss": 2.8555,
+      "step": 900
+    },
+    {
+      "epoch": 0.042146709367938,
+      "grad_norm": 0.16288943588733673,
+      "learning_rate": 0.00038,
+      "loss": 2.851,
+      "step": 950
+    },
+    {
+      "epoch": 0.04436495722940842,
+      "grad_norm": 0.1651136726140976,
+      "learning_rate": 0.0004,
+      "loss": 2.8443,
+      "step": 1000
+    },
+    {
+      "epoch": 0.04658320509087884,
+      "grad_norm": 0.16190673410892487,
+      "learning_rate": 0.00039999468202328424,
+      "loss": 2.8398,
+      "step": 1050
+    },
+    {
+      "epoch": 0.048801452952349264,
+      "grad_norm": 0.1649934947490692,
+      "learning_rate": 0.00039997872837594555,
+      "loss": 2.8371,
+      "step": 1100
+    },
+    {
+      "epoch": 0.051019700813819686,
+      "grad_norm": 0.16184477508068085,
+      "learning_rate": 0.00039995213990639536,
+      "loss": 2.8347,
+      "step": 1150
+    },
+    {
+      "epoch": 0.0532379486752901,
+      "grad_norm": 0.1629864126443863,
+      "learning_rate": 0.0003999149180286022,
+      "loss": 2.834,
+      "step": 1200
+    },
+    {
+      "epoch": 0.055456196536760524,
+      "grad_norm": 0.1627526730298996,
+      "learning_rate": 0.00039986706472201685,
+      "loss": 2.8309,
+      "step": 1250
+    },
+    {
+      "epoch": 0.05767444439823095,
+      "grad_norm": 0.1642647087574005,
+      "learning_rate": 0.000399808582531467,
+      "loss": 2.8352,
+      "step": 1300
+    },
+    {
+      "epoch": 0.05989269225970137,
+      "grad_norm": 0.16397783160209656,
+      "learning_rate": 0.000399739474567022,
+      "loss": 2.8317,
+      "step": 1350
+    },
+    {
+      "epoch": 0.06211094012117179,
+      "grad_norm": 0.16319701075553894,
+      "learning_rate": 0.00039965974450382726,
+      "loss": 2.8322,
+      "step": 1400
+    },
+    {
+      "epoch": 0.06432918798264221,
+      "grad_norm": 0.16067005693912506,
+      "learning_rate": 0.000399569396581909,
+      "loss": 2.8279,
+      "step": 1450
+    },
+    {
+      "epoch": 0.06654743584411263,
+      "grad_norm": 0.16118553280830383,
+      "learning_rate": 0.00039946843560594866,
+      "loss": 2.8323,
+      "step": 1500
+    },
+    {
+      "epoch": 0.06876568370558306,
+      "grad_norm": 0.16291728615760803,
+      "learning_rate": 0.0003993568669450274,
+      "loss": 2.8301,
+      "step": 1550
+    },
+    {
+      "epoch": 0.07098393156705347,
+      "grad_norm": 0.1590035855770111,
+      "learning_rate": 0.0003992346965323407,
+      "loss": 2.8214,
+      "step": 1600
+    },
+    {
+      "epoch": 0.07320217942852389,
+      "grad_norm": 0.16236472129821777,
+      "learning_rate": 0.00039910193086488253,
+      "loss": 2.8242,
+      "step": 1650
+    },
+    {
+      "epoch": 0.07542042728999432,
+      "grad_norm": 0.1617489606142044,
+      "learning_rate": 0.0003989585770031003,
+      "loss": 2.8231,
+      "step": 1700
+    },
+    {
+      "epoch": 0.07763867515146473,
+      "grad_norm": 0.15960238873958588,
+      "learning_rate": 0.000398804642570519,
+      "loss": 2.8248,
+      "step": 1750
+    },
+    {
+      "epoch": 0.07985692301293516,
+      "grad_norm": 0.16391754150390625,
+      "learning_rate": 0.0003986401357533358,
+      "loss": 2.8222,
+      "step": 1800
+    },
+    {
+      "epoch": 0.08207517087440558,
+      "grad_norm": 0.16161847114562988,
+      "learning_rate": 0.000398465065299985,
+      "loss": 2.8153,
+      "step": 1850
+    },
+    {
+      "epoch": 0.084293418735876,
+      "grad_norm": 0.16447125375270844,
+      "learning_rate": 0.00039827944052067265,
+      "loss": 2.818,
+      "step": 1900
+    },
+    {
+      "epoch": 0.08651166659734642,
+      "grad_norm": 0.16384591162204742,
+      "learning_rate": 0.0003980832712868812,
+      "loss": 2.8093,
+      "step": 1950
+    },
+    {
+      "epoch": 0.08872991445881684,
+      "grad_norm": 0.16317427158355713,
+      "learning_rate": 0.0003978765680308447,
+      "loss": 2.8113,
+      "step": 2000
+    },
+    {
+      "epoch": 0.09094816232028727,
+      "grad_norm": 0.16197824478149414,
+      "learning_rate": 0.00039765934174499436,
+      "loss": 2.8134,
+      "step": 2050
+    },
+    {
+      "epoch": 0.09316641018175768,
+      "grad_norm": 0.16196754574775696,
+      "learning_rate": 0.00039743160398137344,
+      "loss": 2.8147,
+      "step": 2100
+    },
+    {
+      "epoch": 0.0953846580432281,
+      "grad_norm": 0.16696424782276154,
+      "learning_rate": 0.00039719336685102314,
+      "loss": 2.811,
+      "step": 2150
+    },
+    {
+      "epoch": 0.09760290590469853,
+      "grad_norm": 0.16266262531280518,
+      "learning_rate": 0.0003969446430233386,
+      "loss": 2.8103,
+      "step": 2200
+    },
+    {
+      "epoch": 0.09982115376616894,
+      "grad_norm": 0.16161397099494934,
+      "learning_rate": 0.0003966854457253951,
+      "loss": 2.8017,
+      "step": 2250
+    },
+    {
+      "epoch": 0.10203940162763937,
+      "grad_norm": 0.1631053388118744,
+      "learning_rate": 0.0003964157887412445,
+      "loss": 2.8034,
+      "step": 2300
+    },
+    {
+      "epoch": 0.10425764948910979,
+      "grad_norm": 0.16185788810253143,
+      "learning_rate": 0.00039613568641118255,
+      "loss": 2.8027,
+      "step": 2350
+    },
+    {
+      "epoch": 0.1064758973505802,
+      "grad_norm": 0.16428661346435547,
+      "learning_rate": 0.00039584515363098584,
+      "loss": 2.8031,
+      "step": 2400
+    },
+    {
+      "epoch": 0.10869414521205063,
+      "grad_norm": 0.1625480055809021,
+      "learning_rate": 0.00039554420585112,
+      "loss": 2.7968,
+      "step": 2450
+    },
+    {
+      "epoch": 0.11091239307352105,
+      "grad_norm": 0.1638619303703308,
+      "learning_rate": 0.0003952328590759179,
+      "loss": 2.8007,
+      "step": 2500
+    },
+    {
+      "epoch": 0.11313064093499148,
+      "grad_norm": 0.16504357755184174,
+      "learning_rate": 0.0003949111298627286,
+      "loss": 2.7921,
+      "step": 2550
+    },
+    {
+      "epoch": 0.1153488887964619,
+      "grad_norm": 0.16429375112056732,
+      "learning_rate": 0.0003945790353210367,
+      "loss": 2.7951,
+      "step": 2600
+    },
+    {
+      "epoch": 0.11756713665793232,
+      "grad_norm": 0.166097030043602,
+      "learning_rate": 0.0003942365931115526,
+      "loss": 2.7948,
+      "step": 2650
+    },
+    {
+      "epoch": 0.11978538451940274,
+      "grad_norm": 0.16275139153003693,
+      "learning_rate": 0.0003938838214452733,
+      "loss": 2.79,
+      "step": 2700
+    },
+    {
+      "epoch": 0.12200363238087315,
+      "grad_norm": 0.16379590332508087,
+      "learning_rate": 0.0003935207390825137,
+      "loss": 2.7896,
+      "step": 2750
+    },
+    {
+      "epoch": 0.12422188024234358,
+      "grad_norm": 0.16332408785820007,
+      "learning_rate": 0.0003931473653319095,
+      "loss": 2.7848,
+      "step": 2800
+    },
+    {
+      "epoch": 0.126440128103814,
+      "grad_norm": 0.16235879063606262,
+      "learning_rate": 0.00039276372004938987,
+      "loss": 2.7836,
+      "step": 2850
+    },
+    {
+      "epoch": 0.12865837596528443,
+      "grad_norm": 0.1654053032398224,
+      "learning_rate": 0.00039236982363712145,
+      "loss": 2.7845,
+      "step": 2900
+    },
+    {
+      "epoch": 0.13087662382675483,
+      "grad_norm": 0.16393068432807922,
+      "learning_rate": 0.00039196569704242376,
+      "loss": 2.7796,
+      "step": 2950
+    },
+    {
+      "epoch": 0.13309487168822526,
+      "grad_norm": 0.16517628729343414,
+      "learning_rate": 0.0003915513617566551,
+      "loss": 2.7738,
+      "step": 3000
+    },
+    {
+      "epoch": 0.13309487168822526,
+      "eval_accuracy": 0.4247227650219834,
+      "eval_loss": 2.8996665477752686,
+      "eval_runtime": 243.2366,
+      "eval_samples_per_second": 8.222,
+      "eval_steps_per_second": 1.028,
+      "step": 3000
+    },
+    {
+      "epoch": 0.1353131195496957,
+      "grad_norm": 0.16540038585662842,
+      "learning_rate": 0.00039112683981406936,
+      "loss": 2.7708,
+      "step": 3050
+    },
+    {
+      "epoch": 0.13753136741116612,
+      "grad_norm": 0.16403205692768097,
+      "learning_rate": 0.00039069215379064465,
+      "loss": 2.7709,
+      "step": 3100
+    },
+    {
+      "epoch": 0.13974961527263652,
+      "grad_norm": 0.16498889029026031,
+      "learning_rate": 0.0003902473268028826,
+      "loss": 2.7683,
+      "step": 3150
+    },
+    {
+      "epoch": 0.14196786313410695,
+      "grad_norm": 0.16713927686214447,
+      "learning_rate": 0.00038979238250657863,
+      "loss": 2.7578,
+      "step": 3200
+    },
+    {
+      "epoch": 0.14418611099557738,
+      "grad_norm": 0.16905058920383453,
+      "learning_rate": 0.00038932734509556467,
+      "loss": 2.7602,
+      "step": 3250
+    },
+    {
+      "epoch": 0.14640435885704778,
+      "grad_norm": 0.16431044042110443,
+      "learning_rate": 0.0003888522393004219,
+      "loss": 2.7685,
+      "step": 3300
+    },
+    {
+      "epoch": 0.1486226067185182,
+      "grad_norm": 0.163705512881279,
+      "learning_rate": 0.00038836709038716583,
+      "loss": 2.8434,
+      "step": 3350
+    },
+    {
+      "epoch": 0.15084085457998864,
+      "grad_norm": 0.1622520387172699,
+      "learning_rate": 0.0003878719241559027,
+      "loss": 2.8349,
+      "step": 3400
+    },
+    {
+      "epoch": 0.15305910244145907,
+      "grad_norm": 0.16072827577590942,
+      "learning_rate": 0.00038736676693945746,
+      "loss": 2.8369,
+      "step": 3450
+    },
+    {
+      "epoch": 0.15527735030292947,
+      "grad_norm": 0.16206832230091095,
+      "learning_rate": 0.0003868516456019733,
+      "loss": 2.8404,
+      "step": 3500
+    },
+    {
+      "epoch": 0.1574955981643999,
+      "grad_norm": 0.16148249804973602,
+      "learning_rate": 0.0003863265875374829,
+      "loss": 2.836,
+      "step": 3550
+    },
+    {
+      "epoch": 0.15971384602587033,
+      "grad_norm": 0.16401226818561554,
+      "learning_rate": 0.0003857916206684519,
+      "loss": 2.8369,
+      "step": 3600
+    },
+    {
+      "epoch": 0.16193209388734073,
+      "grad_norm": 0.15987250208854675,
+      "learning_rate": 0.00038524677344429386,
+      "loss": 2.8363,
+      "step": 3650
+    },
+    {
+      "epoch": 0.16415034174881116,
+      "grad_norm": 0.16117645800113678,
+      "learning_rate": 0.00038469207483985725,
+      "loss": 2.8426,
+      "step": 3700
+    },
+    {
+      "epoch": 0.1663685896102816,
+      "grad_norm": 0.16374363005161285,
+      "learning_rate": 0.00038412755435388474,
+      "loss": 2.8416,
+      "step": 3750
+    },
+    {
+      "epoch": 0.168586837471752,
+      "grad_norm": 0.16495129466056824,
+      "learning_rate": 0.0003835532420074444,
+      "loss": 2.8396,
+      "step": 3800
+    },
+    {
+      "epoch": 0.17080508533322242,
+      "grad_norm": 0.16315814852714539,
+      "learning_rate": 0.0003829691683423329,
+      "loss": 2.8358,
+      "step": 3850
+    },
+    {
+      "epoch": 0.17302333319469285,
+      "grad_norm": 0.16098596155643463,
+      "learning_rate": 0.00038237536441945193,
+      "loss": 2.8354,
+      "step": 3900
+    },
+    {
+      "epoch": 0.17524158105616328,
+      "grad_norm": 0.16226187348365784,
+      "learning_rate": 0.00038177186181715577,
+      "loss": 2.8352,
+      "step": 3950
+    },
+    {
+      "epoch": 0.17745982891763368,
+      "grad_norm": 0.15939714014530182,
+      "learning_rate": 0.00038115869262957233,
+      "loss": 2.835,
+      "step": 4000
+    },
+    {
+      "epoch": 0.1796780767791041,
+      "grad_norm": 0.1622256189584732,
+      "learning_rate": 0.00038053588946489615,
+      "loss": 2.8391,
+      "step": 4050
+    },
+    {
+      "epoch": 0.18189632464057454,
+      "grad_norm": 0.16176052391529083,
+      "learning_rate": 0.0003799034854436545,
+      "loss": 2.8371,
+      "step": 4100
+    },
+    {
+      "epoch": 0.18411457250204494,
+      "grad_norm": 0.16316720843315125,
+      "learning_rate": 0.0003792615141969462,
+      "loss": 2.8365,
+      "step": 4150
+    },
+    {
+      "epoch": 0.18633282036351537,
+      "grad_norm": 0.16207610070705414,
+      "learning_rate": 0.0003786100098646524,
+      "loss": 2.8346,
+      "step": 4200
+    },
+    {
+      "epoch": 0.1885510682249858,
+      "grad_norm": 0.1638234406709671,
+      "learning_rate": 0.000377949007093622,
+      "loss": 2.8319,
+      "step": 4250
+    },
+    {
+      "epoch": 0.1907693160864562,
+      "grad_norm": 0.1628599315881729,
+      "learning_rate": 0.0003772785410358283,
+      "loss": 2.8369,
+      "step": 4300
+    },
+    {
+      "epoch": 0.19298756394792663,
+      "grad_norm": 0.1653933823108673,
+      "learning_rate": 0.00037659864734650026,
+      "loss": 2.8304,
+      "step": 4350
+    },
+    {
+      "epoch": 0.19520581180939706,
+      "grad_norm": 0.16370812058448792,
+      "learning_rate": 0.0003759093621822259,
+      "loss": 2.8369,
+      "step": 4400
+    },
+    {
+      "epoch": 0.19742405967086749,
+      "grad_norm": 0.1629050225019455,
+      "learning_rate": 0.0003752107221990298,
+      "loss": 2.8339,
+      "step": 4450
+    },
+    {
+      "epoch": 0.1996423075323379,
+      "grad_norm": 0.1610899269580841,
+      "learning_rate": 0.00037450276455042354,
+      "loss": 2.829,
+      "step": 4500
+    },
+    {
+      "epoch": 0.20186055539380832,
+      "grad_norm": 0.1629941165447235,
+      "learning_rate": 0.00037378552688543005,
+      "loss": 2.8351,
+      "step": 4550
+    },
+    {
+      "epoch": 0.20407880325527875,
+      "grad_norm": 0.16439735889434814,
+      "learning_rate": 0.0003730590473465814,
+      "loss": 2.8316,
+      "step": 4600
+    },
+    {
+      "epoch": 0.20629705111674915,
+      "grad_norm": 0.16572241485118866,
+      "learning_rate": 0.00037232336456789023,
+      "loss": 2.8335,
+      "step": 4650
+    },
+    {
+      "epoch": 0.20851529897821958,
+      "grad_norm": 0.16380038857460022,
+      "learning_rate": 0.00037157851767279543,
+      "loss": 2.8286,
+      "step": 4700
+    },
+    {
+      "epoch": 0.21073354683969,
+      "grad_norm": 0.16284549236297607,
+      "learning_rate": 0.00037082454627208156,
+      "loss": 2.8301,
+      "step": 4750
+    },
+    {
+      "epoch": 0.2129517947011604,
+      "grad_norm": 0.16597482562065125,
+      "learning_rate": 0.0003700614904617721,
+      "loss": 2.8323,
+      "step": 4800
+    },
+    {
+      "epoch": 0.21517004256263084,
+      "grad_norm": 0.16378666460514069,
+      "learning_rate": 0.0003692893908209973,
+      "loss": 2.8299,
+      "step": 4850
+    },
+    {
+      "epoch": 0.21738829042410127,
+      "grad_norm": 0.1630343496799469,
+      "learning_rate": 0.0003685082884098363,
+      "loss": 2.8333,
+      "step": 4900
+    },
+    {
+      "epoch": 0.2196065382855717,
+      "grad_norm": 0.16490814089775085,
+      "learning_rate": 0.00036771822476713346,
+      "loss": 2.8307,
+      "step": 4950
+    },
+    {
+      "epoch": 0.2218247861470421,
+      "grad_norm": 0.1655721366405487,
+      "learning_rate": 0.00036691924190828935,
+      "loss": 2.8301,
+      "step": 5000
+    },
+    {
+      "epoch": 0.22404303400851253,
+      "grad_norm": 0.16776245832443237,
+      "learning_rate": 0.0003661113823230264,
+      "loss": 2.8228,
+      "step": 5050
+    },
+    {
+      "epoch": 0.22626128186998296,
+      "grad_norm": 0.1626349687576294,
+      "learning_rate": 0.00036529468897312926,
+      "loss": 2.8262,
+      "step": 5100
+    },
+    {
+      "epoch": 0.22847952973145336,
+      "grad_norm": 0.16331753134727478,
+      "learning_rate": 0.00036446920529016,
+      "loss": 2.8282,
+      "step": 5150
+    },
+    {
+      "epoch": 0.2306977775929238,
+      "grad_norm": 0.1676001250743866,
+      "learning_rate": 0.00036363497517314877,
+      "loss": 2.8313,
+      "step": 5200
+    },
+    {
+      "epoch": 0.23291602545439422,
+      "grad_norm": 0.16441357135772705,
+      "learning_rate": 0.000362792042986259,
+      "loss": 2.8278,
+      "step": 5250
+    },
+    {
+      "epoch": 0.23513427331586464,
+      "grad_norm": 0.16601622104644775,
+      "learning_rate": 0.000361940453556428,
+      "loss": 2.8303,
+      "step": 5300
+    },
+    {
+      "epoch": 0.23735252117733505,
+      "grad_norm": 0.1679011583328247,
+      "learning_rate": 0.0003610802521709833,
+      "loss": 2.8252,
+      "step": 5350
+    },
+    {
+      "epoch": 0.23957076903880548,
+      "grad_norm": 0.1650955229997635,
+      "learning_rate": 0.0003602114845752345,
+      "loss": 2.8299,
+      "step": 5400
+    },
+    {
+      "epoch": 0.2417890169002759,
+      "grad_norm": 0.16651777923107147,
+      "learning_rate": 0.00035933419697004,
+      "loss": 2.832,
+      "step": 5450
+    },
+    {
+      "epoch": 0.2440072647617463,
+      "grad_norm": 0.166709303855896,
+      "learning_rate": 0.00035844843600935024,
+      "loss": 2.8262,
+      "step": 5500
+    },
+    {
+      "epoch": 0.24622551262321674,
+      "grad_norm": 0.16586416959762573,
+      "learning_rate": 0.000357554248797727,
+      "loss": 2.8255,
+      "step": 5550
+    },
+    {
+      "epoch": 0.24844376048468717,
+      "grad_norm": 0.1647614985704422,
+      "learning_rate": 0.00035665168288783795,
+      "loss": 2.8298,
+      "step": 5600
+    },
+    {
+      "epoch": 0.2506620083461576,
+      "grad_norm": 0.16310204565525055,
+      "learning_rate": 0.000355740786277928,
+      "loss": 2.8273,
+      "step": 5650
+    },
+    {
+      "epoch": 0.252880256207628,
+      "grad_norm": 0.1629767119884491,
+      "learning_rate": 0.00035482160740926683,
+      "loss": 2.8231,
+      "step": 5700
+    },
+    {
+      "epoch": 0.2550985040690984,
+      "grad_norm": 0.16427451372146606,
+      "learning_rate": 0.00035389419516357253,
+      "loss": 2.8188,
+      "step": 5750
+    },
+    {
+      "epoch": 0.25731675193056885,
+      "grad_norm": 0.1655891388654709,
+      "learning_rate": 0.0003529585988604125,
+      "loss": 2.8258,
+      "step": 5800
+    },
+    {
+      "epoch": 0.25953499979203926,
+      "grad_norm": 0.16402335464954376,
+      "learning_rate": 0.0003520148682545803,
+      "loss": 2.8254,
+      "step": 5850
+    },
+    {
+      "epoch": 0.26175324765350966,
+      "grad_norm": 0.1638861894607544,
+      "learning_rate": 0.0003510630535334497,
+      "loss": 2.8298,
+      "step": 5900
+    },
+    {
+      "epoch": 0.2639714955149801,
+      "grad_norm": 0.16864845156669617,
+      "learning_rate": 0.0003501032053143061,
+      "loss": 2.8238,
+      "step": 5950
+    },
+    {
+      "epoch": 0.2661897433764505,
+      "grad_norm": 0.16578635573387146,
+      "learning_rate": 0.0003491353746416541,
+      "loss": 2.8225,
+      "step": 6000
+    },
+    {
+      "epoch": 0.2661897433764505,
+      "eval_accuracy": 0.4264626282364436,
+      "eval_loss": 2.8843319416046143,
+      "eval_runtime": 242.3694,
+      "eval_samples_per_second": 8.252,
+      "eval_steps_per_second": 1.031,
+      "step": 6000
+    },
+    {
+      "epoch": 0.268407991237921,
+      "grad_norm": 0.16673897206783295,
+      "learning_rate": 0.00034815961298450377,
+      "loss": 2.823,
+      "step": 6050
+    },
+    {
+      "epoch": 0.2706262390993914,
+      "grad_norm": 0.16588376462459564,
+      "learning_rate": 0.0003471759722336326,
+      "loss": 2.8193,
+      "step": 6100
+    },
+    {
+      "epoch": 0.2728444869608618,
+      "grad_norm": 0.16813361644744873,
+      "learning_rate": 0.00034618450469882687,
+      "loss": 2.8267,
+      "step": 6150
+    },
+    {
+      "epoch": 0.27506273482233223,
+      "grad_norm": 0.16656942665576935,
+      "learning_rate": 0.0003451852631060991,
+      "loss": 2.8219,
+      "step": 6200
+    },
+    {
+      "epoch": 0.27728098268380263,
+      "grad_norm": 0.1666443794965744,
+      "learning_rate": 0.0003441783005948846,
+      "loss": 2.8233,
+      "step": 6250
+    },
+    {
+      "epoch": 0.27949923054527304,
+      "grad_norm": 0.1673704832792282,
+      "learning_rate": 0.0003431636707152152,
+      "loss": 2.824,
+      "step": 6300
+    },
+    {
+      "epoch": 0.2817174784067435,
+      "grad_norm": 0.16707104444503784,
+      "learning_rate": 0.00034214142742487177,
+      "loss": 2.8221,
+      "step": 6350
+    },
+    {
+      "epoch": 0.2839357262682139,
+      "grad_norm": 0.16775397956371307,
+      "learning_rate": 0.0003411116250865143,
+      "loss": 2.8234,
+      "step": 6400
+    },
+    {
+      "epoch": 0.2861539741296843,
+      "grad_norm": 0.16813720762729645,
+      "learning_rate": 0.0003400743184647915,
+      "loss": 2.8258,
+      "step": 6450
+    },
+    {
+      "epoch": 0.28837222199115475,
+      "grad_norm": 0.16362161934375763,
+      "learning_rate": 0.00033902956272342783,
+      "loss": 2.8232,
+      "step": 6500
+    },
+    {
+      "epoch": 0.29059046985262516,
+      "grad_norm": 0.16950780153274536,
+      "learning_rate": 0.00033797741342229054,
+      "loss": 2.821,
+      "step": 6550
+    },
+    {
+      "epoch": 0.29280871771409556,
+      "grad_norm": 0.1657160073518753,
+      "learning_rate": 0.00033691792651443435,
+      "loss": 2.8181,
+      "step": 6600
+    },
+    {
+      "epoch": 0.295026965575566,
+      "grad_norm": 0.1689310073852539,
+      "learning_rate": 0.0003358511583431264,
+      "loss": 2.8257,
+      "step": 6650
+    },
+    {
+      "epoch": 0.2972452134370364,
+      "grad_norm": 0.16674135625362396,
+      "learning_rate": 0.00033477716563884956,
+      "loss": 2.8209,
+      "step": 6700
+    },
+    {
+      "epoch": 0.2994634612985068,
+      "grad_norm": 0.16600748896598816,
+      "learning_rate": 0.00033369600551628586,
+      "loss": 2.8227,
+      "step": 6750
+    },
+    {
+      "epoch": 0.3016817091599773,
+      "grad_norm": 0.16666853427886963,
+      "learning_rate": 0.0003326077354712789,
+      "loss": 2.8199,
+      "step": 6800
+    },
+    {
+      "epoch": 0.3038999570214477,
+      "grad_norm": 0.1671936959028244,
+      "learning_rate": 0.00033151241337777624,
+      "loss": 2.82,
+      "step": 6850
+    },
+    {
+      "epoch": 0.30611820488291813,
+      "grad_norm": 0.1675061583518982,
+      "learning_rate": 0.00033041009748475166,
+      "loss": 2.8246,
+      "step": 6900
+    },
+    {
+      "epoch": 0.30833645274438853,
+      "grad_norm": 0.16512750089168549,
+      "learning_rate": 0.0003293008464131079,
+      "loss": 2.8178,
+      "step": 6950
+    },
+    {
+      "epoch": 0.31055470060585894,
+      "grad_norm": 0.1670486181974411,
+      "learning_rate": 0.0003281847191525585,
+      "loss": 2.8185,
+      "step": 7000
+    },
+    {
+      "epoch": 0.3127729484673294,
+      "grad_norm": 0.1692744940519333,
+      "learning_rate": 0.0003270617750584913,
+      "loss": 2.8184,
+      "step": 7050
+    },
+    {
+      "epoch": 0.3149911963287998,
+      "grad_norm": 0.16573506593704224,
+      "learning_rate": 0.0003259320738488119,
+      "loss": 2.823,
+      "step": 7100
+    },
+    {
+      "epoch": 0.3172094441902702,
+      "grad_norm": 0.17004618048667908,
+      "learning_rate": 0.00032479567560076745,
+      "loss": 2.8174,
+      "step": 7150
+    },
+    {
+      "epoch": 0.31942769205174065,
+      "grad_norm": 0.16867642104625702,
+      "learning_rate": 0.00032365264074775223,
+      "loss": 2.8183,
+      "step": 7200
+    },
+    {
+      "epoch": 0.32164593991321105,
+      "grad_norm": 0.16543437540531158,
+      "learning_rate": 0.00032250303007609366,
+      "loss": 2.8178,
+      "step": 7250
+    },
+    {
+      "epoch": 0.32386418777468146,
+      "grad_norm": 0.16606374084949493,
+      "learning_rate": 0.0003213469047218194,
+      "loss": 2.8182,
+      "step": 7300
+    },
+    {
+      "epoch": 0.3260824356361519,
+      "grad_norm": 0.1708928942680359,
+      "learning_rate": 0.0003201843261674067,
+      "loss": 2.8194,
+      "step": 7350
+    },
+    {
+      "epoch": 0.3283006834976223,
+      "grad_norm": 0.16661237180233002,
+      "learning_rate": 0.00031901535623851245,
+      "loss": 2.8226,
+      "step": 7400
+    },
+    {
+      "epoch": 0.3305189313590927,
+      "grad_norm": 0.16710756719112396,
+      "learning_rate": 0.0003178400571006852,
+      "loss": 2.8187,
+      "step": 7450
+    },
+    {
+      "epoch": 0.3327371792205632,
+      "grad_norm": 0.16679760813713074,
+      "learning_rate": 0.00031665849125605937,
+      "loss": 2.8163,
+      "step": 7500
+    },
+    {
+      "epoch": 0.3349554270820336,
+      "grad_norm": 0.16872857511043549,
+      "learning_rate": 0.00031547072154003154,
+      "loss": 2.8147,
+      "step": 7550
+    },
+    {
+      "epoch": 0.337173674943504,
+      "grad_norm": 0.1672954261302948,
+      "learning_rate": 0.0003142768111179187,
+      "loss": 2.8167,
+      "step": 7600
+    },
+    {
+      "epoch": 0.33939192280497443,
+      "grad_norm": 0.16654394567012787,
+      "learning_rate": 0.00031307682348159907,
+      "loss": 2.816,
+      "step": 7650
+    },
+    {
+      "epoch": 0.34161017066644483,
+      "grad_norm": 0.16810841858386993,
+      "learning_rate": 0.00031187082244613567,
+      "loss": 2.8139,
+      "step": 7700
+    },
+    {
+      "epoch": 0.34382841852791524,
+      "grad_norm": 0.1682497262954712,
+      "learning_rate": 0.00031065887214638284,
+      "loss": 2.8157,
+      "step": 7750
+    },
+    {
+      "epoch": 0.3460466663893857,
+      "grad_norm": 0.17154847085475922,
+      "learning_rate": 0.00030944103703357524,
+      "loss": 2.8143,
+      "step": 7800
+    },
+    {
+      "epoch": 0.3482649142508561,
+      "grad_norm": 0.16658836603164673,
+      "learning_rate": 0.00030821738187190075,
+      "loss": 2.8143,
+      "step": 7850
+    },
+    {
+      "epoch": 0.35048316211232655,
+      "grad_norm": 0.16820305585861206,
+      "learning_rate": 0.00030698797173505586,
+      "loss": 2.8157,
+      "step": 7900
+    },
+    {
+      "epoch": 0.35270140997379695,
+      "grad_norm": 0.16843385994434357,
+      "learning_rate": 0.0003057528720027853,
+      "loss": 2.8103,
+      "step": 7950
+    },
+    {
+      "epoch": 0.35491965783526735,
+      "grad_norm": 0.17145898938179016,
+      "learning_rate": 0.0003045121483574054,
+      "loss": 2.8161,
+      "step": 8000
+    },
+    {
+      "epoch": 0.3571379056967378,
+      "grad_norm": 0.1709701269865036,
+      "learning_rate": 0.00030326586678031066,
+      "loss": 2.8134,
+      "step": 8050
+    },
+    {
+      "epoch": 0.3593561535582082,
+      "grad_norm": 0.16859866678714752,
+      "learning_rate": 0.0003020140935484653,
+      "loss": 2.818,
+      "step": 8100
+    },
+    {
+      "epoch": 0.3615744014196786,
+      "grad_norm": 0.16738031804561615,
+      "learning_rate": 0.00030075689523087804,
+      "loss": 2.8128,
+      "step": 8150
+    },
+    {
+      "epoch": 0.36379264928114907,
+      "grad_norm": 0.1693500131368637,
+      "learning_rate": 0.00029949433868506293,
+      "loss": 2.8138,
+      "step": 8200
+    },
+    {
+      "epoch": 0.3660108971426195,
+      "grad_norm": 0.16915106773376465,
+      "learning_rate": 0.00029822649105348294,
+      "loss": 2.8209,
+      "step": 8250
+    },
+    {
+      "epoch": 0.3682291450040899,
+      "grad_norm": 0.17108069360256195,
+      "learning_rate": 0.00029695341975998006,
+      "loss": 2.8174,
+      "step": 8300
+    },
+    {
+      "epoch": 0.37044739286556033,
+      "grad_norm": 0.16659317910671234,
+      "learning_rate": 0.00029567519250618907,
+      "loss": 2.8153,
+      "step": 8350
+    },
+    {
+      "epoch": 0.37266564072703073,
+      "grad_norm": 0.16678906977176666,
+      "learning_rate": 0.0002943918772679379,
+      "loss": 2.8163,
+      "step": 8400
+    },
+    {
+      "epoch": 0.37488388858850114,
+      "grad_norm": 0.16928167641162872,
+      "learning_rate": 0.00029310354229163197,
+      "loss": 2.8165,
+      "step": 8450
+    },
+    {
+      "epoch": 0.3771021364499716,
+      "grad_norm": 0.1695391833782196,
+      "learning_rate": 0.0002918102560906254,
+      "loss": 2.8197,
+      "step": 8500
+    },
+    {
+      "epoch": 0.379320384311442,
+      "grad_norm": 0.17006346583366394,
+      "learning_rate": 0.0002905120874415772,
+      "loss": 2.8172,
+      "step": 8550
+    },
+    {
+      "epoch": 0.3815386321729124,
+      "grad_norm": 0.16821132600307465,
+      "learning_rate": 0.0002892091053807939,
+      "loss": 2.8137,
+      "step": 8600
+    },
+    {
+      "epoch": 0.38375688003438285,
+      "grad_norm": 0.17077401280403137,
+      "learning_rate": 0.000287901379200558,
+      "loss": 2.8174,
+      "step": 8650
+    },
+    {
+      "epoch": 0.38597512789585325,
+      "grad_norm": 0.17006562650203705,
+      "learning_rate": 0.0002865889784454435,
+      "loss": 2.813,
+      "step": 8700
+    },
+    {
+      "epoch": 0.3881933757573237,
+      "grad_norm": 0.16847462952136993,
+      "learning_rate": 0.0002852719729086167,
+      "loss": 2.8158,
+      "step": 8750
+    },
+    {
+      "epoch": 0.3904116236187941,
+      "grad_norm": 0.16790613532066345,
+      "learning_rate": 0.0002839504326281256,
+      "loss": 2.816,
+      "step": 8800
+    },
+    {
+      "epoch": 0.3926298714802645,
+      "grad_norm": 0.16898341476917267,
+      "learning_rate": 0.00028262442788317446,
+      "loss": 2.8143,
+      "step": 8850
+    },
+    {
+      "epoch": 0.39484811934173497,
+      "grad_norm": 0.17099575698375702,
+      "learning_rate": 0.00028129402919038695,
+      "loss": 2.812,
+      "step": 8900
+    },
+    {
+      "epoch": 0.3970663672032054,
+      "grad_norm": 0.17063932120800018,
+      "learning_rate": 0.00027995930730005577,
+      "loss": 2.815,
+      "step": 8950
+    },
+    {
+      "epoch": 0.3992846150646758,
+      "grad_norm": 0.1704034060239792,
+      "learning_rate": 0.00027862033319238025,
+      "loss": 2.8144,
+      "step": 9000
+    },
+    {
+      "epoch": 0.3992846150646758,
+      "eval_accuracy": 0.42786541279921836,
+      "eval_loss": 2.8759515285491943,
+      "eval_runtime": 250.6732,
+      "eval_samples_per_second": 7.979,
+      "eval_steps_per_second": 0.997,
+      "step": 9000
+    },
+    {
+      "epoch": 0.40150286292614623,
+      "grad_norm": 0.1675969511270523,
+      "learning_rate": 0.0002772771780736917,
+      "loss": 2.8128,
+      "step": 9050
+    },
+    {
+      "epoch": 0.40372111078761663,
+      "grad_norm": 0.1697956621646881,
+      "learning_rate": 0.0002759299133726665,
+      "loss": 2.8121,
+      "step": 9100
+    },
+    {
+      "epoch": 0.40593935864908703,
+      "grad_norm": 0.1710100620985031,
+      "learning_rate": 0.00027457861073652785,
+      "loss": 2.8156,
+      "step": 9150
+    },
+    {
+      "epoch": 0.4081576065105575,
+      "grad_norm": 0.16877809166908264,
+      "learning_rate": 0.00027322334202723527,
+      "loss": 2.815,
+      "step": 9200
+    },
+    {
+      "epoch": 0.4103758543720279,
+      "grad_norm": 0.17122440040111542,
+      "learning_rate": 0.0002718641793176631,
+      "loss": 2.8119,
+      "step": 9250
+    },
+    {
+      "epoch": 0.4125941022334983,
+      "grad_norm": 0.16771045327186584,
+      "learning_rate": 0.0002705011948877679,
+      "loss": 2.808,
+      "step": 9300
+    },
+    {
+      "epoch": 0.41481235009496875,
+      "grad_norm": 0.16941729187965393,
+      "learning_rate": 0.0002691344612207442,
+      "loss": 2.8121,
+      "step": 9350
+    },
+    {
+      "epoch": 0.41703059795643915,
+      "grad_norm": 0.1719992607831955,
+      "learning_rate": 0.00026776405099917014,
+      "loss": 2.8094,
+      "step": 9400
+    },
+    {
+      "epoch": 0.41924884581790955,
+      "grad_norm": 0.1693243533372879,
+      "learning_rate": 0.00026639003710114223,
+      "loss": 2.8103,
+      "step": 9450
+    },
+    {
+      "epoch": 0.42146709367938,
+      "grad_norm": 0.17014500498771667,
+      "learning_rate": 0.0002650124925963998,
+      "loss": 2.8129,
+      "step": 9500
+    },
+    {
+      "epoch": 0.4236853415408504,
+      "grad_norm": 0.1709510087966919,
+      "learning_rate": 0.00026363149074243867,
+      "loss": 2.8084,
+      "step": 9550
+    },
+    {
+      "epoch": 0.4259035894023208,
+      "grad_norm": 0.16937118768692017,
+      "learning_rate": 0.0002622471049806159,
+      "loss": 2.814,
+      "step": 9600
+    },
+    {
+      "epoch": 0.42812183726379127,
+      "grad_norm": 0.1713036149740219,
+      "learning_rate": 0.00026085940893224403,
+      "loss": 2.8162,
+      "step": 9650
+    },
+    {
+      "epoch": 0.4303400851252617,
+      "grad_norm": 0.17020347714424133,
+      "learning_rate": 0.0002594684763946758,
+      "loss": 2.8116,
+      "step": 9700
+    },
+    {
+      "epoch": 0.43255833298673213,
+      "grad_norm": 0.16786696016788483,
+      "learning_rate": 0.0002580743813373796,
+      "loss": 2.8111,
+      "step": 9750
+    },
+    {
+      "epoch": 0.43477658084820253,
+      "grad_norm": 0.17273075878620148,
+      "learning_rate": 0.00025667719789800606,
+      "loss": 2.8131,
+      "step": 9800
+    },
+    {
+      "epoch": 0.43699482870967293,
+      "grad_norm": 0.16986466944217682,
+      "learning_rate": 0.00025527700037844515,
+      "loss": 2.8139,
+      "step": 9850
+    },
+    {
+      "epoch": 0.4392130765711434,
+      "grad_norm": 0.17129731178283691,
+      "learning_rate": 0.00025387386324087494,
+      "loss": 2.8125,
+      "step": 9900
+    },
+    {
+      "epoch": 0.4414313244326138,
+      "grad_norm": 0.16890868544578552,
+      "learning_rate": 0.00025246786110380163,
+      "loss": 2.8142,
+      "step": 9950
+    },
+    {
+      "epoch": 0.4436495722940842,
+      "grad_norm": 0.17167522013187408,
+      "learning_rate": 0.00025105906873809154,
+      "loss": 2.8142,
+      "step": 10000
+    },
+    {
+      "epoch": 0.44586782015555465,
+      "grad_norm": 0.17136669158935547,
+      "learning_rate": 0.0002496475610629947,
+      "loss": 2.8112,
+      "step": 10050
+    },
+    {
+      "epoch": 0.44808606801702505,
+      "grad_norm": 0.16926760971546173,
+      "learning_rate": 0.00024823341314216056,
+      "loss": 2.8156,
+      "step": 10100
+    },
+    {
+      "epoch": 0.45030431587849545,
+      "grad_norm": 0.16898435354232788,
+      "learning_rate": 0.00024681670017964627,
+      "loss": 2.8079,
+      "step": 10150
+    },
+    {
+      "epoch": 0.4525225637399659,
+      "grad_norm": 0.17237040400505066,
+      "learning_rate": 0.0002453974975159173,
+      "loss": 2.813,
+      "step": 10200
+    },
+    {
+      "epoch": 0.4547408116014363,
+      "grad_norm": 0.16995486617088318,
+      "learning_rate": 0.00024397588062384095,
+      "loss": 2.8117,
+      "step": 10250
+    },
+    {
+      "epoch": 0.4569590594629067,
+      "grad_norm": 0.17290563881397247,
+      "learning_rate": 0.00024255192510467245,
+      "loss": 2.8121,
+      "step": 10300
+    },
+    {
+      "epoch": 0.45917730732437717,
+      "grad_norm": 0.17059782147407532,
+      "learning_rate": 0.00024112570668403472,
+      "loss": 2.8138,
+      "step": 10350
+    },
+    {
+      "epoch": 0.4613955551858476,
+      "grad_norm": 0.17196382582187653,
+      "learning_rate": 0.00023969730120789132,
+      "loss": 2.8095,
+      "step": 10400
+    },
+    {
+      "epoch": 0.463613803047318,
+      "grad_norm": 0.16942380368709564,
+      "learning_rate": 0.00023826678463851285,
+      "loss": 2.8124,
+      "step": 10450
+    },
+    {
+      "epoch": 0.46583205090878843,
+      "grad_norm": 0.17288681864738464,
+      "learning_rate": 0.00023683423305043749,
+      "loss": 2.813,
+      "step": 10500
+    },
+    {
+      "epoch": 0.46805029877025883,
+      "grad_norm": 0.17040428519248962,
+      "learning_rate": 0.00023539972262642502,
+      "loss": 2.8141,
+      "step": 10550
+    },
+    {
+      "epoch": 0.4702685466317293,
+      "grad_norm": 0.17321184277534485,
+      "learning_rate": 0.00023396332965340585,
+      "loss": 2.8146,
+      "step": 10600
+    },
+    {
+      "epoch": 0.4724867944931997,
+      "grad_norm": 0.17026926577091217,
+      "learning_rate": 0.00023252513051842373,
+      "loss": 2.8086,
+      "step": 10650
+    },
+    {
+      "epoch": 0.4747050423546701,
+      "grad_norm": 0.1710352748632431,
+      "learning_rate": 0.00023108520170457398,
+      "loss": 2.8099,
+      "step": 10700
+    },
+    {
+      "epoch": 0.47692329021614055,
+      "grad_norm": 0.17067080736160278,
+      "learning_rate": 0.00022964361978693542,
+      "loss": 2.8099,
+      "step": 10750
+    },
+    {
+      "epoch": 0.47914153807761095,
+      "grad_norm": 0.17244164645671844,
+      "learning_rate": 0.0002282004614284989,
+      "loss": 2.8054,
+      "step": 10800
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 22540,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5864483085358727e+19,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}