root committed 24 days ago

Commit

070e573

•

1 Parent(s): 74b95fc

add ckpt27

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +5 -0
checkpoint-1000/config.json +30 -0
checkpoint-1000/generation_config.json +6 -0
checkpoint-1000/latest +1 -0
checkpoint-1000/model.safetensors +3 -0
checkpoint-1000/rng_state_0.pth +0 -0
checkpoint-1000/rng_state_1.pth +0 -0
checkpoint-1000/rng_state_2.pth +0 -0
checkpoint-1000/rng_state_3.pth +0 -0
checkpoint-1000/rng_state_4.pth +0 -0
checkpoint-1000/rng_state_5.pth +0 -0
checkpoint-1000/rng_state_6.pth +0 -0
checkpoint-1000/rng_state_7.pth +0 -0
checkpoint-1000/scheduler.pt +0 -0
checkpoint-1000/special_tokens_map.json +24 -0
checkpoint-1000/tokenizer.json +0 -0
checkpoint-1000/tokenizer.model +0 -0
checkpoint-1000/tokenizer_config.json +46 -0
checkpoint-1000/trainer_state.json +733 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-1000/zero_to_fp32.py +604 -0
checkpoint-27/config.json +30 -0
checkpoint-27/generation_config.json +6 -0
checkpoint-27/model.safetensors +3 -0
checkpoint-27/special_tokens_map.json +24 -0
checkpoint-27/tokenizer.json +0 -0
checkpoint-27/tokenizer.model +3 -0
checkpoint-27/tokenizer_config.json +43 -0
checkpoint-4500/config.json +30 -0
checkpoint-4500/eval-20241021102538-11_tasks.log +277 -0
checkpoint-4500/generation_config.json +6 -0
checkpoint-4500/latest +1 -0
checkpoint-4500/model.safetensors +3 -0
checkpoint-4500/rng_state_0.pth +0 -0
checkpoint-4500/rng_state_1.pth +0 -0
checkpoint-4500/rng_state_2.pth +0 -0
checkpoint-4500/rng_state_3.pth +0 -0
checkpoint-4500/rng_state_4.pth +0 -0
checkpoint-4500/rng_state_5.pth +0 -0
checkpoint-4500/rng_state_6.pth +0 -0
checkpoint-4500/rng_state_7.pth +0 -0
checkpoint-4500/scheduler.pt +0 -0
checkpoint-4500/special_tokens_map.json +24 -0
checkpoint-4500/tokenizer.json +0 -0
checkpoint-4500/tokenizer.model +0 -0
checkpoint-4500/tokenizer_config.json +46 -0
checkpoint-4500/trainer_state.json +3215 -0
checkpoint-4500/training_args.bin +3 -0
checkpoint-4500/zero_to_fp32.py +604 -0
checkpoint-579/config.json +30 -0

.gitattributes CHANGED Viewed

@@ -1,3 +1,4 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +34,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

+<<<<<<< HEAD
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+=======
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+>>>>>>> 421f174 (Initial commit)

checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "/mnt/ddn/yrm/model/MMfreeLM-370M",
+  "architectures": [
+    "HGRNBitForCausalLM"
+  ],
+  "attn_mode": "fused_recurrent",
+  "bos_token_id": 1,
+  "conv_size": 4,
+  "eos_token_id": 2,
+  "expand_ratio": 1,
+  "fuse_cross_entropy": true,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "max_position_embeddings": 2048,
+  "model_type": "hgrn_bit",
+  "num_heads": 1,
+  "num_hidden_layers": 24,
+  "rms_norm_eps": 1e-06,
+  "share_conv_kernel": true,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.2",
+  "use_cache": false,
+  "use_lower_bound": true,
+  "use_short_conv": false,
+  "vocab_size": 32000
+}

checkpoint-1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.45.2"
+}

checkpoint-1000/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step1000

checkpoint-1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c61faaa032568d4957872d607d52905b4a706bb65c63c307c0c419a0781ec9d
+size 748256328

checkpoint-1000/rng_state_0.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/rng_state_1.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/rng_state_2.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/rng_state_3.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/rng_state_4.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/rng_state_5.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/rng_state_6.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/rng_state_7.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-1000/scheduler.pt ADDED Viewed

Binary file (1.06 kB). View file

checkpoint-1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/tokenizer.model ADDED Viewed

Binary file (493 kB). View file

checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,733 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.5839793281653747,
+  "eval_steps": 5000,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.025839793281653745,
+      "grad_norm": 21431.55859375,
+      "learning_rate": 3.4188034188034193e-06,
+      "loss": 5737.0758,
+      "step": 10
+    },
+    {
+      "epoch": 0.05167958656330749,
+      "grad_norm": 21565.48046875,
+      "learning_rate": 6.837606837606839e-06,
+      "loss": 5749.3406,
+      "step": 20
+    },
+    {
+      "epoch": 0.07751937984496124,
+      "grad_norm": 17265.16796875,
+      "learning_rate": 1.0256410256410256e-05,
+      "loss": 5788.193,
+      "step": 30
+    },
+    {
+      "epoch": 0.10335917312661498,
+      "grad_norm": 16618.6875,
+      "learning_rate": 1.3675213675213677e-05,
+      "loss": 5726.6566,
+      "step": 40
+    },
+    {
+      "epoch": 0.12919896640826872,
+      "grad_norm": 18382.3359375,
+      "learning_rate": 1.7094017094017095e-05,
+      "loss": 5755.7125,
+      "step": 50
+    },
+    {
+      "epoch": 0.15503875968992248,
+      "grad_norm": 1329.3785400390625,
+      "learning_rate": 2.0512820512820512e-05,
+      "loss": 3448.4984,
+      "step": 60
+    },
+    {
+      "epoch": 0.18087855297157623,
+      "grad_norm": 982.5582885742188,
+      "learning_rate": 2.393162393162393e-05,
+      "loss": 793.4248,
+      "step": 70
+    },
+    {
+      "epoch": 0.20671834625322996,
+      "grad_norm": 441.6934814453125,
+      "learning_rate": 2.7350427350427355e-05,
+      "loss": 664.0245,
+      "step": 80
+    },
+    {
+      "epoch": 0.23255813953488372,
+      "grad_norm": 384.9512634277344,
+      "learning_rate": 3.0769230769230774e-05,
+      "loss": 590.2515,
+      "step": 90
+    },
+    {
+      "epoch": 0.25839793281653745,
+      "grad_norm": 1645.2818603515625,
+      "learning_rate": 3.418803418803419e-05,
+      "loss": 540.1542,
+      "step": 100
+    },
+    {
+      "epoch": 0.2842377260981912,
+      "grad_norm": 656.8567504882812,
+      "learning_rate": 3.760683760683761e-05,
+      "loss": 540.0785,
+      "step": 110
+    },
+    {
+      "epoch": 0.31007751937984496,
+      "grad_norm": 209.0157928466797,
+      "learning_rate": 3.999918503621906e-05,
+      "loss": 500.4066,
+      "step": 120
+    },
+    {
+      "epoch": 0.3359173126614987,
+      "grad_norm": 312.04351806640625,
+      "learning_rate": 3.9984698638788994e-05,
+      "loss": 476.6278,
+      "step": 130
+    },
+    {
+      "epoch": 0.36175710594315247,
+      "grad_norm": 111.13858795166016,
+      "learning_rate": 3.995211703336012e-05,
+      "loss": 457.4648,
+      "step": 140
+    },
+    {
+      "epoch": 0.3875968992248062,
+      "grad_norm": 198.55479431152344,
+      "learning_rate": 3.9901469721049156e-05,
+      "loss": 439.1863,
+      "step": 150
+    },
+    {
+      "epoch": 0.4134366925064599,
+      "grad_norm": 179.00514221191406,
+      "learning_rate": 3.983280256062371e-05,
+      "loss": 435.1474,
+      "step": 160
+    },
+    {
+      "epoch": 0.4392764857881137,
+      "grad_norm": 204.16162109375,
+      "learning_rate": 3.9746177726979355e-05,
+      "loss": 444.1413,
+      "step": 170
+    },
+    {
+      "epoch": 0.46511627906976744,
+      "grad_norm": 232.0103759765625,
+      "learning_rate": 3.964167365484312e-05,
+      "loss": 427.4143,
+      "step": 180
+    },
+    {
+      "epoch": 0.4909560723514212,
+      "grad_norm": 75.20649719238281,
+      "learning_rate": 3.951938496775456e-05,
+      "loss": 420.458,
+      "step": 190
+    },
+    {
+      "epoch": 0.5167958656330749,
+      "grad_norm": 270.50970458984375,
+      "learning_rate": 3.937942239238855e-05,
+      "loss": 406.7704,
+      "step": 200
+    },
+    {
+      "epoch": 0.5426356589147286,
+      "grad_norm": 146.1454315185547,
+      "learning_rate": 3.92219126582975e-05,
+      "loss": 407.9724,
+      "step": 210
+    },
+    {
+      "epoch": 0.5684754521963824,
+      "grad_norm": 205.81834411621094,
+      "learning_rate": 3.904699838316363e-05,
+      "loss": 416.5542,
+      "step": 220
+    },
+    {
+      "epoch": 0.5943152454780362,
+      "grad_norm": 120.94080352783203,
+      "learning_rate": 3.885483794366543e-05,
+      "loss": 415.2502,
+      "step": 230
+    },
+    {
+      "epoch": 0.6201550387596899,
+      "grad_norm": 108.84484100341797,
+      "learning_rate": 3.86456053320749e-05,
+      "loss": 401.6582,
+      "step": 240
+    },
+    {
+      "epoch": 0.6459948320413437,
+      "grad_norm": 240.042724609375,
+      "learning_rate": 3.841948999871579e-05,
+      "loss": 398.7828,
+      "step": 250
+    },
+    {
+      "epoch": 0.6718346253229974,
+      "grad_norm": 74.43096160888672,
+      "learning_rate": 3.817669668042516e-05,
+      "loss": 389.9398,
+      "step": 260
+    },
+    {
+      "epoch": 0.6976744186046512,
+      "grad_norm": 209.9810028076172,
+      "learning_rate": 3.7917445215173765e-05,
+      "loss": 389.4235,
+      "step": 270
+    },
+    {
+      "epoch": 0.7235142118863049,
+      "grad_norm": 109.08077239990234,
+      "learning_rate": 3.7641970343013115e-05,
+      "loss": 392.0608,
+      "step": 280
+    },
+    {
+      "epoch": 0.7493540051679587,
+      "grad_norm": 109.67909240722656,
+      "learning_rate": 3.7350521493529335e-05,
+      "loss": 390.3438,
+      "step": 290
+    },
+    {
+      "epoch": 0.7751937984496124,
+      "grad_norm": 248.89028930664062,
+      "learning_rate": 3.704336255999636e-05,
+      "loss": 387.3038,
+      "step": 300
+    },
+    {
+      "epoch": 0.8010335917312662,
+      "grad_norm": 119.14262390136719,
+      "learning_rate": 3.672077166043294e-05,
+      "loss": 377.6907,
+      "step": 310
+    },
+    {
+      "epoch": 0.8268733850129198,
+      "grad_norm": 142.8026123046875,
+      "learning_rate": 3.638304088577984e-05,
+      "loss": 385.1249,
+      "step": 320
+    },
+    {
+      "epoch": 0.8527131782945736,
+      "grad_norm": 173.82833862304688,
+      "learning_rate": 3.603047603542515e-05,
+      "loss": 375.1511,
+      "step": 330
+    },
+    {
+      "epoch": 0.8785529715762274,
+      "grad_norm": 164.8625946044922,
+      "learning_rate": 3.566339634031729e-05,
+      "loss": 375.9214,
+      "step": 340
+    },
+    {
+      "epoch": 0.9043927648578811,
+      "grad_norm": 117.82234191894531,
+      "learning_rate": 3.528213417391633e-05,
+      "loss": 377.7271,
+      "step": 350
+    },
+    {
+      "epoch": 0.9302325581395349,
+      "grad_norm": 41.274600982666016,
+      "learning_rate": 3.488703475124541e-05,
+      "loss": 368.712,
+      "step": 360
+    },
+    {
+      "epoch": 0.9560723514211886,
+      "grad_norm": 138.67919921875,
+      "learning_rate": 3.4478455816314724e-05,
+      "loss": 375.8104,
+      "step": 370
+    },
+    {
+      "epoch": 0.9819121447028424,
+      "grad_norm": 61.867340087890625,
+      "learning_rate": 3.405676731820106e-05,
+      "loss": 374.8659,
+      "step": 380
+    },
+    {
+      "epoch": 1.0077519379844961,
+      "grad_norm": 168.50962829589844,
+      "learning_rate": 3.362235107607629e-05,
+      "loss": 367.1276,
+      "step": 390
+    },
+    {
+      "epoch": 1.0335917312661498,
+      "grad_norm": 94.74224853515625,
+      "learning_rate": 3.317560043348795e-05,
+      "loss": 361.7362,
+      "step": 400
+    },
+    {
+      "epoch": 1.0594315245478036,
+      "grad_norm": 57.599578857421875,
+      "learning_rate": 3.2716919902205154e-05,
+      "loss": 360.4581,
+      "step": 410
+    },
+    {
+      "epoch": 1.0852713178294573,
+      "grad_norm": 127.81239318847656,
+      "learning_rate": 3.224672479595208e-05,
+      "loss": 358.2213,
+      "step": 420
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 262.8865661621094,
+      "learning_rate": 3.176544085436091e-05,
+      "loss": 360.1062,
+      "step": 430
+    },
+    {
+      "epoch": 1.1369509043927648,
+      "grad_norm": 247.9445343017578,
+      "learning_rate": 3.127350385748453e-05,
+      "loss": 367.3566,
+      "step": 440
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 204.389892578125,
+      "learning_rate": 3.077135923121809e-05,
+      "loss": 354.9228,
+      "step": 450
+    },
+    {
+      "epoch": 1.1886304909560723,
+      "grad_norm": 260.5009460449219,
+      "learning_rate": 3.0259461643986784e-05,
+      "loss": 356.919,
+      "step": 460
+    },
+    {
+      "epoch": 1.2144702842377262,
+      "grad_norm": 149.97607421875,
+      "learning_rate": 2.9738274595064845e-05,
+      "loss": 354.5676,
+      "step": 470
+    },
+    {
+      "epoch": 1.2403100775193798,
+      "grad_norm": 194.74737548828125,
+      "learning_rate": 2.9208269994898725e-05,
+      "loss": 353.6357,
+      "step": 480
+    },
+    {
+      "epoch": 1.2661498708010335,
+      "grad_norm": 200.73731994628906,
+      "learning_rate": 2.8669927737814244e-05,
+      "loss": 363.2115,
+      "step": 490
+    },
+    {
+      "epoch": 1.2919896640826873,
+      "grad_norm": 66.00718688964844,
+      "learning_rate": 2.8123735267494826e-05,
+      "loss": 350.5789,
+      "step": 500
+    },
+    {
+      "epoch": 1.3178294573643412,
+      "grad_norm": 136.6959991455078,
+      "learning_rate": 2.7570187135624063e-05,
+      "loss": 347.9595,
+      "step": 510
+    },
+    {
+      "epoch": 1.3436692506459949,
+      "grad_norm": 40.67978286743164,
+      "learning_rate": 2.7009784554092338e-05,
+      "loss": 351.9972,
+      "step": 520
+    },
+    {
+      "epoch": 1.3695090439276485,
+      "grad_norm": 34.59364318847656,
+      "learning_rate": 2.6443034941172962e-05,
+      "loss": 349.3325,
+      "step": 530
+    },
+    {
+      "epoch": 1.3953488372093024,
+      "grad_norm": 130.43624877929688,
+      "learning_rate": 2.5870451462078697e-05,
+      "loss": 345.4973,
+      "step": 540
+    },
+    {
+      "epoch": 1.421188630490956,
+      "grad_norm": 52.83992004394531,
+      "learning_rate": 2.529255256431472e-05,
+      "loss": 350.3788,
+      "step": 550
+    },
+    {
+      "epoch": 1.4470284237726099,
+      "grad_norm": 92.32342529296875,
+      "learning_rate": 2.4709861508248688e-05,
+      "loss": 350.4281,
+      "step": 560
+    },
+    {
+      "epoch": 1.4728682170542635,
+      "grad_norm": 56.88703536987305,
+      "learning_rate": 2.4122905893323006e-05,
+      "loss": 349.739,
+      "step": 570
+    },
+    {
+      "epoch": 1.4987080103359174,
+      "grad_norm": 73.592529296875,
+      "learning_rate": 2.3532217180338283e-05,
+      "loss": 355.5978,
+      "step": 580
+    },
+    {
+      "epoch": 1.524547803617571,
+      "grad_norm": 107.73036193847656,
+      "learning_rate": 2.2938330210240424e-05,
+      "loss": 338.8647,
+      "step": 590
+    },
+    {
+      "epoch": 1.550387596899225,
+      "grad_norm": 151.14724731445312,
+      "learning_rate": 2.2341782719847292e-05,
+      "loss": 339.6862,
+      "step": 600
+    },
+    {
+      "epoch": 1.5762273901808785,
+      "grad_norm": 109.76298522949219,
+      "learning_rate": 2.174311485495317e-05,
+      "loss": 351.1045,
+      "step": 610
+    },
+    {
+      "epoch": 1.6020671834625322,
+      "grad_norm": 75.17061614990234,
+      "learning_rate": 2.1142868681252072e-05,
+      "loss": 344.3581,
+      "step": 620
+    },
+    {
+      "epoch": 1.627906976744186,
+      "grad_norm": 43.415557861328125,
+      "learning_rate": 2.0541587693522694e-05,
+      "loss": 346.1661,
+      "step": 630
+    },
+    {
+      "epoch": 1.65374677002584,
+      "grad_norm": 95.63790893554688,
+      "learning_rate": 1.99398163235193e-05,
+      "loss": 340.8094,
+      "step": 640
+    },
+    {
+      "epoch": 1.6795865633074936,
+      "grad_norm": 173.1468048095703,
+      "learning_rate": 1.9338099447014348e-05,
+      "loss": 344.9255,
+      "step": 650
+    },
+    {
+      "epoch": 1.7054263565891472,
+      "grad_norm": 185.2141571044922,
+      "learning_rate": 1.8736981890438973e-05,
+      "loss": 345.9086,
+      "step": 660
+    },
+    {
+      "epoch": 1.731266149870801,
+      "grad_norm": 185.66473388671875,
+      "learning_rate": 1.8137007937568198e-05,
+      "loss": 342.1713,
+      "step": 670
+    },
+    {
+      "epoch": 1.757105943152455,
+      "grad_norm": 135.51266479492188,
+      "learning_rate": 1.7538720836697505e-05,
+      "loss": 336.7293,
+      "step": 680
+    },
+    {
+      "epoch": 1.7829457364341086,
+      "grad_norm": 74.66438293457031,
+      "learning_rate": 1.6942662308756942e-05,
+      "loss": 340.9484,
+      "step": 690
+    },
+    {
+      "epoch": 1.8087855297157622,
+      "grad_norm": 76.51793670654297,
+      "learning_rate": 1.6349372056808196e-05,
+      "loss": 332.7376,
+      "step": 700
+    },
+    {
+      "epoch": 1.8346253229974159,
+      "grad_norm": 88.05294799804688,
+      "learning_rate": 1.5759387277368817e-05,
+      "loss": 337.1342,
+      "step": 710
+    },
+    {
+      "epoch": 1.8604651162790697,
+      "grad_norm": 82.69217681884766,
+      "learning_rate": 1.517324217400589e-05,
+      "loss": 338.4325,
+      "step": 720
+    },
+    {
+      "epoch": 1.8863049095607236,
+      "grad_norm": 49.15666580200195,
+      "learning_rate": 1.4591467473639769e-05,
+      "loss": 333.9558,
+      "step": 730
+    },
+    {
+      "epoch": 1.9121447028423773,
+      "grad_norm": 100.37080383300781,
+      "learning_rate": 1.4014589945995718e-05,
+      "loss": 339.7346,
+      "step": 740
+    },
+    {
+      "epoch": 1.937984496124031,
+      "grad_norm": 64.05115509033203,
+      "learning_rate": 1.3443131926638637e-05,
+      "loss": 336.4353,
+      "step": 750
+    },
+    {
+      "epoch": 1.9638242894056848,
+      "grad_norm": 40.96150207519531,
+      "learning_rate": 1.287761084402265e-05,
+      "loss": 344.2413,
+      "step": 760
+    },
+    {
+      "epoch": 1.9896640826873386,
+      "grad_norm": 42.53565979003906,
+      "learning_rate": 1.2318538750983903e-05,
+      "loss": 326.7869,
+      "step": 770
+    },
+    {
+      "epoch": 2.0155038759689923,
+      "grad_norm": 33.191429138183594,
+      "learning_rate": 1.1766421861100734e-05,
+      "loss": 330.7202,
+      "step": 780
+    },
+    {
+      "epoch": 2.041343669250646,
+      "grad_norm": 25.623939514160156,
+      "learning_rate": 1.1221760090340987e-05,
+      "loss": 332.5341,
+      "step": 790
+    },
+    {
+      "epoch": 2.0671834625322996,
+      "grad_norm": 41.96136474609375,
+      "learning_rate": 1.068504660441154e-05,
+      "loss": 332.9494,
+      "step": 800
+    },
+    {
+      "epoch": 2.0930232558139537,
+      "grad_norm": 27.031042098999023,
+      "learning_rate": 1.0156767372219854e-05,
+      "loss": 325.7135,
+      "step": 810
+    },
+    {
+      "epoch": 2.1188630490956073,
+      "grad_norm": 58.57283401489258,
+      "learning_rate": 9.637400725851947e-06,
+      "loss": 331.5063,
+      "step": 820
+    },
+    {
+      "epoch": 2.144702842377261,
+      "grad_norm": 54.434139251708984,
+      "learning_rate": 9.127416927465047e-06,
+      "loss": 327.2943,
+      "step": 830
+    },
+    {
+      "epoch": 2.1705426356589146,
+      "grad_norm": 36.624176025390625,
+      "learning_rate": 8.627277743487296e-06,
+      "loss": 332.8677,
+      "step": 840
+    },
+    {
+      "epoch": 2.1963824289405687,
+      "grad_norm": 53.19684600830078,
+      "learning_rate": 8.137436026509862e-06,
+      "loss": 333.1244,
+      "step": 850
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 37.776268005371094,
+      "learning_rate": 7.65833530525017e-06,
+      "loss": 329.4649,
+      "step": 860
+    },
+    {
+      "epoch": 2.248062015503876,
+      "grad_norm": 26.57048797607422,
+      "learning_rate": 7.190409382957408e-06,
+      "loss": 336.0614,
+      "step": 870
+    },
+    {
+      "epoch": 2.2739018087855296,
+      "grad_norm": 57.234336853027344,
+      "learning_rate": 6.734081944624027e-06,
+      "loss": 328.4645,
+      "step": 880
+    },
+    {
+      "epoch": 2.2997416020671837,
+      "grad_norm": 56.26626205444336,
+      "learning_rate": 6.289766173358826e-06,
+      "loss": 324.8838,
+      "step": 890
+    },
+    {
+      "epoch": 2.3255813953488373,
+      "grad_norm": 48.253318786621094,
+      "learning_rate": 5.857864376269051e-06,
+      "loss": 327.6445,
+      "step": 900
+    },
+    {
+      "epoch": 2.351421188630491,
+      "grad_norm": 28.283784866333008,
+      "learning_rate": 5.438767620190108e-06,
+      "loss": 326.632,
+      "step": 910
+    },
+    {
+      "epoch": 2.3772609819121446,
+      "grad_norm": 41.68576431274414,
+      "learning_rate": 5.032855377592904e-06,
+      "loss": 325.8222,
+      "step": 920
+    },
+    {
+      "epoch": 2.4031007751937983,
+      "grad_norm": 22.881074905395508,
+      "learning_rate": 4.64049518298932e-06,
+      "loss": 324.5381,
+      "step": 930
+    },
+    {
+      "epoch": 2.4289405684754524,
+      "grad_norm": 26.17159652709961,
+      "learning_rate": 4.262042300146898e-06,
+      "loss": 330.8671,
+      "step": 940
+    },
+    {
+      "epoch": 2.454780361757106,
+      "grad_norm": 44.048988342285156,
+      "learning_rate": 3.897839400414187e-06,
+      "loss": 326.6512,
+      "step": 950
+    },
+    {
+      "epoch": 2.4806201550387597,
+      "grad_norm": 29.078561782836914,
+      "learning_rate": 3.548216252447867e-06,
+      "loss": 321.7997,
+      "step": 960
+    },
+    {
+      "epoch": 2.5064599483204133,
+      "grad_norm": 20.772472381591797,
+      "learning_rate": 3.21348942362272e-06,
+      "loss": 331.5863,
+      "step": 970
+    },
+    {
+      "epoch": 2.532299741602067,
+      "grad_norm": 20.700204849243164,
+      "learning_rate": 2.893961993394667e-06,
+      "loss": 326.034,
+      "step": 980
+    },
+    {
+      "epoch": 2.558139534883721,
+      "grad_norm": 52.08636474609375,
+      "learning_rate": 2.5899232788765604e-06,
+      "loss": 329.6624,
+      "step": 990
+    },
+    {
+      "epoch": 2.5839793281653747,
+      "grad_norm": 26.22823715209961,
+      "learning_rate": 2.3016485728750724e-06,
+      "loss": 324.0566,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1161,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.2584857851448525e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9a1192bc455be45a1d733feace997991864076dc60e3ce9e8b39722b0631e4c
+size 6648

checkpoint-1000/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,604 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# set to a truthy value to enable the diagnostic print() calls guarded by `if debug:`
+debug = 0
+# load to cpu: this device is passed as map_location to every torch.load call below
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load every ``*_optim_states.pt`` shard and extract the fp32 master-weight partitions.
+    Args:
+        - ``files``: paths of the optimizer state files to load (one is expected per partition)
+        - ``ds_checkpoint_dir``: checkpoint folder the files belong to (only used in error messages)
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` has one entry per file,
+          in the order of ``files``
+    Raises:
+        - ``ValueError``: if no files are given, the first file is not a zero checkpoint, the number of
+          files differs from the recorded partition count, or the zero stage is unknown
+    """
+    if not files:
+        raise ValueError(f"No '*_optim_states.pt' files to parse under '{ds_checkpoint_dir}'")
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        # (the outer key is the same entry that is read through OPTIMIZER_STATE_DICT below)
+        state_dict[OPTIMIZER_STATE_DICT].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if isinstance(world_size, list):
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild a consolidated fp32 state_dict from the files of one ds checkpoint folder
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: passed through to the stage-specific reconstruction routine
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # both stage-specific routines take the same positional arguments
+    merge_args = (world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+    if zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(*merge_args)
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(*merge_args)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded in ``zero_model_states[0]`` into ``state_dict`` (in place).
+    Returns without touching ``state_dict`` when there are no ``frozen_param_shapes``. Each entry of
+    ``frozen_param_fragments`` is stored under its name as-is; only the first model state is consulted.
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+    if debug:
+        # the total over the recorded shapes is reported twice: once per rank line, once as "Need"
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        num_elem = wanted_numel
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_params += 1
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    # a missing attribute falls back to None, which is not callable
+    return callable(getattr(obj, fn, None))
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild every trainable parameter from the per-rank fp32 partitions and store it in ``state_dict``.
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries (modified in place)
+        - ``world_size``: number of partitions; also used to derive the alignment for the sanity check
+        - ``fp32_flat_groups``: indexed as ``[rank][param_group]``, each entry a flat tensor
+        - ``zero_model_states``: only ``zero_model_states[0].param_shapes`` is read here
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed numel of a group differs from what is available
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # 1. for each param group, concatenate that group's partition from every rank (in the order of
+    #    ``fp32_flat_groups``) into one flat vector
+    # 2. walk the group's ``name -> shape`` mapping in order, slicing ``numel`` elements per param
+    #    from the flat vector and viewing them with the param's shape
+    # 3. check that what was consumed matches what was available, modulo alignment padding
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    # the number of param groups is taken from the first rank's list
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather group i from every rank, then join them end to end
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups; only reported in debug mode (it is re-assigned per group further down)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # param_shapes and the merged vectors are paired group by group
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts for every group because each group has its own flat vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shape may expose numel() or be a plain sequence of dims, hence the fallback to math.prod
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # take this param's contiguous slice of the flat vector and give it its full shape
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the consolidated state_dict for the zero stage <= 2 layout.
+    Entries are added in this order: buffers, frozen params (unless ``exclude_frozen_parameters``),
+    trainable params, and finally aliases for shared params. Returns the resulting ``OrderedDict``.
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters: each pair is [alias, source]; the alias is only added
+    # when its source was actually reconstructed
+    for alias_name, source_name in rank0_state.shared_params:
+        if source_name in state_dict:
+            state_dict[alias_name] = state_dict[source_name]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a param of ``unpartitioned_numel`` elements split
+    over ``world_size`` partitions: the per-partition length (rounded up) and the number of elements
+    needed to make the total a multiple of ``world_size``.
+    """
+    _, leftover = divmod(unpartitioned_numel, world_size)
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild each frozen parameter from its per-rank fragments and store it in ``state_dict`` (in place).
+    Returns without touching ``state_dict`` when ``zero_model_states[0]`` has no ``frozen_param_shapes``.
+    For every name, the fragments of all model states are concatenated in order, trimmed to the
+    parameter's numel and viewed with its recorded shape.
+    """
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        # estimate: the first model state's fragment total times the number of partitions
+        rank0_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values())
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        # join the fragments, then drop anything past the real numel before reshaping
+        joined = torch.cat(param_frags, 0)
+        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild every trainable parameter from the per-rank flat fp32 tensors and store it in ``state_dict``.
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries (modified in place)
+        - ``world_size``: number of partitions each parameter is split into
+        - ``fp32_flat_groups``: indexed as ``[rank]``, each entry a single flat tensor
+        - ``zero_model_states``: only ``zero_model_states[0].param_shapes`` is read here
+    Raises:
+        - ``ValueError``: if the total consumed numel differs from ``fp32_flat_groups[0].numel() * world_size``
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # what is available in total, assuming every rank's flat tensor has the same length as rank 0's
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        # (same value as the avail_numel computed at the top of the function)
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset is a position inside ONE rank's flat tensor; the same offset is used for every rank
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        # take this param's slice from each rank in rank order, join the slices, drop the trailing
+        # padding (everything past unpartitioned_numel) and view the result with the full shape
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # convert the per-rank offset into a total across all ranks so it is comparable with avail_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the consolidated state_dict for the zero stage 3 layout.
+    Entries are added in this order: buffers, frozen params (unless ``exclude_frozen_parameters``),
+    trainable params, and finally aliases for shared params. Returns the resulting ``OrderedDict``.
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters: each pair is [alias, source]; the alias is only added
+    # when its source was actually reconstructed
+    for alias_name, source_name in rank0_state.shared_params:
+        if source_name in state_dict:
+            state_dict[alias_name] = state_dict[source_name]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Turn a ZeRO 2 or 3 checkpoint into one consolidated fp32 state_dict that ``load_state_dict()``
+    accepts, so the weights can be used for training without DeepSpeed or shared with others, for
+    example via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Raises:
+        - ``ValueError``: ``tag`` is not given and there is no 'latest' file in ``checkpoint_dir``
+        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder does not exist
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        # no explicit tag: the 'latest' file inside the checkpoint folder names it
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if os.path.isdir(ds_checkpoint_dir):
+        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Consolidate a ZeRO 2 or 3 checkpoint into one fp32 ``state_dict`` and write it to ``output_file``
+    with ``torch.save``. The file can then be loaded with ``torch.load(file)`` + ``load_state_dict()``
+    and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                            tag=tag,
+                                                            exclude_frozen_parameters=exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(consolidated, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    2. Put the provided model to cpu
+    3. Load the ``state_dict`` into the provided model (non-strict, so missing or unexpected keys are not an error)
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    # strict=False: keys that do not line up between the model and the state_dict are tolerated
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    # Command-line entry point: parse the arguments, set the debug flag and run the conversion.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("checkpoint_dir",
+                     type=str,
+                     help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    cli.add_argument("output_file",
+                     type=str,
+                     help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    cli.add_argument("-t",
+                     "--tag",
+                     type=str,
+                     default=None,
+                     help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    cli.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    cli.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = cli.parse_args()
+    # ``debug`` is assigned at module level here; the functions in this file read it as a global
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-27/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "/apdcephfs_qy3/share_301069248/users/rummyyang/matmulfreellm/model/MMfreeLM-370M",
+  "architectures": [
+    "HGRNBitForCausalLM"
+  ],
+  "attn_mode": "fused_recurrent",
+  "bos_token_id": 1,
+  "conv_size": 4,
+  "eos_token_id": 2,
+  "expand_ratio": 1,
+  "fuse_cross_entropy": true,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "max_position_embeddings": 2048,
+  "model_type": "hgrn_bit",
+  "num_heads": 1,
+  "num_hidden_layers": 24,
+  "rms_norm_eps": 1e-06,
+  "share_conv_kernel": true,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.45.2",
+  "use_cache": true,
+  "use_lower_bound": true,
+  "use_short_conv": false,
+  "vocab_size": 32000
+}

checkpoint-27/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.45.2"
+}

checkpoint-27/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25a683b43602104752ed2d95020d4e9964a210f1e95d0524ecd5921909e2b730
+size 1496472568

checkpoint-27/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-27/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-27/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-27/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-4500/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "/usr/yrm/model/MMfreeLM-370M",
+  "architectures": [
+    "HGRNBitForCausalLM"
+  ],
+  "attn_mode": "fused_recurrent",
+  "bos_token_id": 1,
+  "conv_size": 4,
+  "eos_token_id": 2,
+  "expand_ratio": 1,
+  "fuse_cross_entropy": true,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "max_position_embeddings": 2048,
+  "model_type": "hgrn_bit",
+  "num_heads": 1,
+  "num_hidden_layers": 24,
+  "rms_norm_eps": 1e-06,
+  "share_conv_kernel": true,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.2",
+  "use_cache": false,
+  "use_lower_bound": true,
+  "use_short_conv": false,
+  "vocab_size": 32000
+}

checkpoint-4500/eval-20241021102538-11_tasks.log ADDED Viewed

	@@ -0,0 +1,277 @@

+hf (pretrained=/apdcephfs_qy3/share_301069248/users/rummyyang/LLaMA-Factory/saves/llama3-1b/lora/pretrain/sft/checkpoint-4500), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 128
+|                       Tasks                        |Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+|----------------------------------------------------|-------|----------------|-----:|-----------|---|-----:|---|-----:|
+|arc_challenge                                       |      1|none            |     0|acc        |↑  |0.2048|±  |0.0118|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2372|±  |0.0124|
+|arc_easy                                            |      1|none            |     0|acc        |↑  |0.4407|±  |0.0102|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.4007|±  |0.0101|
+|ceval-valid                                         |N/A    |none            |     0|acc        |↑  |0.2623|±  |0.0120|
+|ceval-valid_accountant                              |      1|none            |     0|acc        |↑  |0.2449|±  |0.0621|
+|ceval-valid_advanced_mathematics                    |      1|none            |     0|acc        |↑  |0.2105|±  |0.0961|
+|ceval-valid_art_studies                             |      1|none            |     0|acc        |↑  |0.1515|±  |0.0634|
+|ceval-valid_basic_medicine                          |      1|none            |     0|acc        |↑  |0.3684|±  |0.1137|
+|ceval-valid_business_administration                 |      1|none            |     0|acc        |↑  |0.2727|±  |0.0787|
+|ceval-valid_chinese_language_and_literature         |      1|none            |     0|acc        |↑  |0.1304|±  |0.0718|
+|ceval-valid_civil_servant                           |      1|none            |     0|acc        |↑  |0.1702|±  |0.0554|
+|ceval-valid_clinical_medicine                       |      1|none            |     0|acc        |↑  |0.2273|±  |0.0914|
+|ceval-valid_college_chemistry                       |      1|none            |     0|acc        |↑  |0.2500|±  |0.0903|
+|ceval-valid_college_economics                       |      1|none            |     0|acc        |↑  |0.3455|±  |0.0647|
+|ceval-valid_college_physics                         |      1|none            |     0|acc        |↑  |0.2632|±  |0.1038|
+|ceval-valid_college_programming                     |      1|none            |     0|acc        |↑  |0.2973|±  |0.0762|
+|ceval-valid_computer_architecture                   |      1|none            |     0|acc        |↑  |0.2857|±  |0.1010|
+|ceval-valid_computer_network                        |      1|none            |     0|acc        |↑  |0.4737|±  |0.1177|
+|ceval-valid_discrete_mathematics                    |      1|none            |     0|acc        |↑  |0.2500|±  |0.1118|
+|ceval-valid_education_science                       |      1|none            |     0|acc        |↑  |0.3448|±  |0.0898|
+|ceval-valid_electrical_engineer                     |      1|none            |     0|acc        |↑  |0.2973|±  |0.0762|
+|ceval-valid_environmental_impact_assessment_engineer|      1|none            |     0|acc        |↑  |0.2903|±  |0.0829|
+|ceval-valid_fire_engineer                           |      1|none            |     0|acc        |↑  |0.3226|±  |0.0853|
+|ceval-valid_high_school_biology                     |      1|none            |     0|acc        |↑  |0.2632|±  |0.1038|
+|ceval-valid_high_school_chemistry                   |      1|none            |     0|acc        |↑  |0.2632|±  |0.1038|
+|ceval-valid_high_school_chinese                     |      1|none            |     0|acc        |↑  |0.1579|±  |0.0859|
+|ceval-valid_high_school_geography                   |      1|none            |     0|acc        |↑  |0.2632|±  |0.1038|
+|ceval-valid_high_school_history                     |      1|none            |     0|acc        |↑  |0.5500|±  |0.1141|
+|ceval-valid_high_school_mathematics                 |      1|none            |     0|acc        |↑  |0.2222|±  |0.1008|
+|ceval-valid_high_school_physics                     |      1|none            |     0|acc        |↑  |0.2105|±  |0.0961|
+|ceval-valid_high_school_politics                    |      1|none            |     0|acc        |↑  |0.1579|±  |0.0859|
+|ceval-valid_ideological_and_moral_cultivation       |      1|none            |     0|acc        |↑  |0.3684|±  |0.1137|
+|ceval-valid_law                                     |      1|none            |     0|acc        |↑  |0.1667|±  |0.0777|
+|ceval-valid_legal_professional                      |      1|none            |     0|acc        |↑  |0.2174|±  |0.0879|
+|ceval-valid_logic                                   |      1|none            |     0|acc        |↑  |0.1818|±  |0.0842|
+|ceval-valid_mao_zedong_thought                      |      1|none            |     0|acc        |↑  |0.2500|±  |0.0903|
+|ceval-valid_marxism                                 |      1|none            |     0|acc        |↑  |0.3684|±  |0.1137|
+|ceval-valid_metrology_engineer                      |      1|none            |     0|acc        |↑  |0.1667|±  |0.0777|
+|ceval-valid_middle_school_biology                   |      1|none            |     0|acc        |↑  |0.0952|±  |0.0656|
+|ceval-valid_middle_school_chemistry                 |      1|none            |     0|acc        |↑  |0.3000|±  |0.1051|
+|ceval-valid_middle_school_geography                 |      1|none            |     0|acc        |↑  |0.2500|±  |0.1306|
+|ceval-valid_middle_school_history                   |      1|none            |     0|acc        |↑  |0.0455|±  |0.0455|
+|ceval-valid_middle_school_mathematics               |      1|none            |     0|acc        |↑  |0.2105|±  |0.0961|
+|ceval-valid_middle_school_physics                   |      1|none            |     0|acc        |↑  |0.2632|±  |0.1038|
+|ceval-valid_middle_school_politics                  |      1|none            |     0|acc        |↑  |0.2381|±  |0.0952|
+|ceval-valid_modern_chinese_history                  |      1|none            |     0|acc        |↑  |0.1739|±  |0.0808|
+|ceval-valid_operating_system                        |      1|none            |     0|acc        |↑  |0.3158|±  |0.1096|
+|ceval-valid_physician                               |      1|none            |     0|acc        |↑  |0.2653|±  |0.0637|
+|ceval-valid_plant_protection                        |      1|none            |     0|acc        |↑  |0.2273|±  |0.0914|
+|ceval-valid_probability_and_statistics              |      1|none            |     0|acc        |↑  |0.3889|±  |0.1182|
+|ceval-valid_professional_tour_guide                 |      1|none            |     0|acc        |↑  |0.2759|±  |0.0845|
+|ceval-valid_sports_science                          |      1|none            |     0|acc        |↑  |0.2105|±  |0.0961|
+|ceval-valid_tax_accountant                          |      1|none            |     0|acc        |↑  |0.3265|±  |0.0677|
+|ceval-valid_teacher_qualification                   |      1|none            |     0|acc        |↑  |0.2955|±  |0.0696|
+|ceval-valid_urban_and_rural_planner                 |      1|none            |     0|acc        |↑  |0.3043|±  |0.0686|
+|ceval-valid_veterinary_medicine                     |      1|none            |     0|acc        |↑  |0.3043|±  |0.0981|
+|cmmlu                                               |N/A    |none            |     0|acc        |↑  |0.2475|±  |0.0040|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2475|±  |0.0040|
+|cmmlu_agronomy                                      |      0|none            |     0|acc        |↑  |0.2544|±  |0.0336|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2544|±  |0.0336|
+|cmmlu_anatomy                                       |      0|none            |     0|acc        |↑  |0.2500|±  |0.0357|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2500|±  |0.0357|
+|cmmlu_ancient_chinese                               |      0|none            |     0|acc        |↑  |0.2134|±  |0.0321|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2134|±  |0.0321|
+|cmmlu_arts                                          |      0|none            |     0|acc        |↑  |0.2375|±  |0.0337|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2375|±  |0.0337|
+|cmmlu_astronomy                                     |      0|none            |     0|acc        |↑  |0.2424|±  |0.0335|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2424|±  |0.0335|
+|cmmlu_business_ethics                               |      0|none            |     0|acc        |↑  |0.2344|±  |0.0294|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2344|±  |0.0294|
+|cmmlu_chinese_civil_service_exam                    |      0|none            |     0|acc        |↑  |0.2500|±  |0.0343|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2500|±  |0.0343|
+|cmmlu_chinese_driving_rule                          |      0|none            |     0|acc        |↑  |0.2366|±  |0.0373|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2366|±  |0.0373|
+|cmmlu_chinese_food_culture                          |      0|none            |     0|acc        |↑  |0.2353|±  |0.0365|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2353|±  |0.0365|
+|cmmlu_chinese_foreign_policy                        |      0|none            |     0|acc        |↑  |0.2430|±  |0.0417|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2430|±  |0.0417|
+|cmmlu_chinese_history                               |      0|none            |     0|acc        |↑  |0.2508|±  |0.0242|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2508|±  |0.0242|
+|cmmlu_chinese_literature                            |      0|none            |     0|acc        |↑  |0.2353|±  |0.0298|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2353|±  |0.0298|
+|cmmlu_chinese_teacher_qualification                 |      0|none            |     0|acc        |↑  |0.2235|±  |0.0312|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2235|±  |0.0312|
+|cmmlu_clinical_knowledge                            |      0|none            |     0|acc        |↑  |0.2278|±  |0.0273|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2278|±  |0.0273|
+|cmmlu_college_actuarial_science                     |      0|none            |     0|acc        |↑  |0.2170|±  |0.0402|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2170|±  |0.0402|
+|cmmlu_college_education                             |      0|none            |     0|acc        |↑  |0.3271|±  |0.0456|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.3271|±  |0.0456|
+|cmmlu_college_engineering_hydrology                 |      0|none            |     0|acc        |↑  |0.2642|±  |0.0430|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2642|±  |0.0430|
+|cmmlu_college_law                                   |      0|none            |     0|acc        |↑  |0.2222|±  |0.0402|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2222|±  |0.0402|
+|cmmlu_college_mathematics                           |      0|none            |     0|acc        |↑  |0.2095|±  |0.0399|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2095|±  |0.0399|
+|cmmlu_college_medical_statistics                    |      0|none            |     0|acc        |↑  |0.2547|±  |0.0425|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2547|±  |0.0425|
+|cmmlu_college_medicine                              |      0|none            |     0|acc        |↑  |0.2784|±  |0.0272|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2784|±  |0.0272|
+|cmmlu_computer_science                              |      0|none            |     0|acc        |↑  |0.2157|±  |0.0289|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2157|±  |0.0289|
+|cmmlu_computer_security                             |      0|none            |     0|acc        |↑  |0.2632|±  |0.0338|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2632|±  |0.0338|
+|cmmlu_conceptual_physics                            |      0|none            |     0|acc        |↑  |0.2653|±  |0.0365|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2653|±  |0.0365|
+|cmmlu_construction_project_management               |      0|none            |     0|acc        |↑  |0.2446|±  |0.0366|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2446|±  |0.0366|
+|cmmlu_economics                                     |      0|none            |     0|acc        |↑  |0.2579|±  |0.0348|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2579|±  |0.0348|
+|cmmlu_education                                     |      0|none            |     0|acc        |↑  |0.2270|±  |0.0329|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2270|±  |0.0329|
+|cmmlu_electrical_engineering                        |      0|none            |     0|acc        |↑  |0.2500|±  |0.0331|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2500|±  |0.0331|
+|cmmlu_elementary_chinese                            |      0|none            |     0|acc        |↑  |0.2341|±  |0.0267|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2341|±  |0.0267|
+|cmmlu_elementary_commonsense                        |      0|none            |     0|acc        |↑  |0.2626|±  |0.0314|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2626|±  |0.0314|
+|cmmlu_elementary_information_and_technology         |      0|none            |     0|acc        |↑  |0.2479|±  |0.0280|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2479|±  |0.0280|
+|cmmlu_elementary_mathematics                        |      0|none            |     0|acc        |↑  |0.2957|±  |0.0302|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2957|±  |0.0302|
+|cmmlu_ethnology                                     |      0|none            |     0|acc        |↑  |0.2963|±  |0.0394|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2963|±  |0.0394|
+|cmmlu_food_science                                  |      0|none            |     0|acc        |↑  |0.2587|±  |0.0368|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2587|±  |0.0368|
+|cmmlu_genetics                                      |      0|none            |     0|acc        |↑  |0.2386|±  |0.0322|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2386|±  |0.0322|
+|cmmlu_global_facts                                  |      0|none            |     0|acc        |↑  |0.2752|±  |0.0367|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2752|±  |0.0367|
+|cmmlu_high_school_biology                           |      0|none            |     0|acc        |↑  |0.2249|±  |0.0322|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2249|±  |0.0322|
+|cmmlu_high_school_chemistry                         |      0|none            |     0|acc        |↑  |0.2652|±  |0.0386|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2652|±  |0.0386|
+|cmmlu_high_school_geography                         |      0|none            |     0|acc        |↑  |0.2288|±  |0.0388|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2288|±  |0.0388|
+|cmmlu_high_school_mathematics                       |      0|none            |     0|acc        |↑  |0.2561|±  |0.0342|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2561|±  |0.0342|
+|cmmlu_high_school_physics                           |      0|none            |     0|acc        |↑  |0.1636|±  |0.0354|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.1636|±  |0.0354|
+|cmmlu_high_school_politics                          |      0|none            |     0|acc        |↑  |0.2378|±  |0.0357|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2378|±  |0.0357|
+|cmmlu_human_sexuality                               |      0|none            |     0|acc        |↑  |0.2222|±  |0.0372|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2222|±  |0.0372|
+|cmmlu_international_law                             |      0|none            |     0|acc        |↑  |0.2432|±  |0.0316|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2432|±  |0.0316|
+|cmmlu_journalism                                    |      0|none            |     0|acc        |↑  |0.2674|±  |0.0338|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2674|±  |0.0338|
+|cmmlu_jurisprudence                                 |      0|none            |     0|acc        |↑  |0.2482|±  |0.0213|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2482|±  |0.0213|
+|cmmlu_legal_and_moral_basis                         |      0|none            |     0|acc        |↑  |0.2617|±  |0.0301|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2617|±  |0.0301|
+|cmmlu_logical                                       |      0|none            |     0|acc        |↑  |0.2033|±  |0.0364|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2033|±  |0.0364|
+|cmmlu_machine_learning                              |      0|none            |     0|acc        |↑  |0.3279|±  |0.0427|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.3279|±  |0.0427|
+|cmmlu_management                                    |      0|none            |     0|acc        |↑  |0.2190|±  |0.0286|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2190|±  |0.0286|
+|cmmlu_marketing                                     |      0|none            |     0|acc        |↑  |0.2056|±  |0.0302|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2056|±  |0.0302|
+|cmmlu_marxist_theory                                |      0|none            |     0|acc        |↑  |0.2540|±  |0.0317|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2540|±  |0.0317|
+|cmmlu_modern_chinese                                |      0|none            |     0|acc        |↑  |0.2241|±  |0.0389|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2241|±  |0.0389|
+|cmmlu_nutrition                                     |      0|none            |     0|acc        |↑  |0.2483|±  |0.0360|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2483|±  |0.0360|
+|cmmlu_philosophy                                    |      0|none            |     0|acc        |↑  |0.2571|±  |0.0429|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2571|±  |0.0429|
+|cmmlu_professional_accounting                       |      0|none            |     0|acc        |↑  |0.2914|±  |0.0344|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2914|±  |0.0344|
+|cmmlu_professional_law                              |      0|none            |     0|acc        |↑  |0.2038|±  |0.0278|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2038|±  |0.0278|
+|cmmlu_professional_medicine                         |      0|none            |     0|acc        |↑  |0.2527|±  |0.0224|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2527|±  |0.0224|
+|cmmlu_professional_psychology                       |      0|none            |     0|acc        |↑  |0.2586|±  |0.0288|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2586|±  |0.0288|
+|cmmlu_public_relations                              |      0|none            |     0|acc        |↑  |0.2644|±  |0.0335|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2644|±  |0.0335|
+|cmmlu_security_study                                |      0|none            |     0|acc        |↑  |0.2741|±  |0.0385|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2741|±  |0.0385|
+|cmmlu_sociology                                     |      0|none            |     0|acc        |↑  |0.2743|±  |0.0297|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2743|±  |0.0297|
+|cmmlu_sports_science                                |      0|none            |     0|acc        |↑  |0.2545|±  |0.0340|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2545|±  |0.0340|
+|cmmlu_traditional_chinese_medicine                  |      0|none            |     0|acc        |↑  |0.2541|±  |0.0321|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2541|±  |0.0321|
+|cmmlu_virology                                      |      0|none            |     0|acc        |↑  |0.2485|±  |0.0333|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2485|±  |0.0333|
+|cmmlu_world_history                                 |      0|none            |     0|acc        |↑  |0.2484|±  |0.0342|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2484|±  |0.0342|
+|cmmlu_world_religions                               |      0|none            |     0|acc        |↑  |0.2250|±  |0.0331|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.2250|±  |0.0331|
+|gsm8k_cot                                           |      3|flexible-extract|     8|exact_match|↑  |0.0152|±  |0.0034|
+|                                                    |       |strict-match    |     8|exact_match|↑  |0.0061|±  |0.0021|
+|hellaswag                                           |      1|none            |     0|acc        |↑  |0.2996|±  |0.0046|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.3276|±  |0.0047|
+|mmlu                                                |N/A    |none            |     0|acc        |↑  |0.2479|±  |0.0036|
+|mmlu_abstract_algebra                               |      0|none            |     0|acc        |↑  |0.2600|±  |0.0441|
+|mmlu_anatomy                                        |      0|none            |     0|acc        |↑  |0.2741|±  |0.0385|
+|mmlu_astronomy                                      |      0|none            |     0|acc        |↑  |0.2105|±  |0.0332|
+|mmlu_business_ethics                                |      0|none            |     0|acc        |↑  |0.2600|±  |0.0441|
+|mmlu_clinical_knowledge                             |      0|none            |     0|acc        |↑  |0.2679|±  |0.0273|
+|mmlu_college_biology                                |      0|none            |     0|acc        |↑  |0.2292|±  |0.0351|
+|mmlu_college_chemistry                              |      0|none            |     0|acc        |↑  |0.2600|±  |0.0441|
+|mmlu_college_computer_science                       |      0|none            |     0|acc        |↑  |0.1800|±  |0.0386|
+|mmlu_college_mathematics                            |      0|none            |     0|acc        |↑  |0.2800|±  |0.0451|
+|mmlu_college_medicine                               |      0|none            |     0|acc        |↑  |0.2197|±  |0.0316|
+|mmlu_college_physics                                |      0|none            |     0|acc        |↑  |0.2353|±  |0.0422|
+|mmlu_computer_security                              |      0|none            |     0|acc        |↑  |0.2500|±  |0.0435|
+|mmlu_conceptual_physics                             |      0|none            |     0|acc        |↑  |0.3404|±  |0.0310|
+|mmlu_econometrics                                   |      0|none            |     0|acc        |↑  |0.2632|±  |0.0414|
+|mmlu_electrical_engineering                         |      0|none            |     0|acc        |↑  |0.1931|±  |0.0329|
+|mmlu_elementary_mathematics                         |      0|none            |     0|acc        |↑  |0.2672|±  |0.0228|
+|mmlu_formal_logic                                   |      0|none            |     0|acc        |↑  |0.2381|±  |0.0381|
+|mmlu_global_facts                                   |      0|none            |     0|acc        |↑  |0.2000|±  |0.0402|
+|mmlu_high_school_biology                            |      0|none            |     0|acc        |↑  |0.2613|±  |0.0250|
+|mmlu_high_school_chemistry                          |      0|none            |     0|acc        |↑  |0.2217|±  |0.0292|
+|mmlu_high_school_computer_science                   |      0|none            |     0|acc        |↑  |0.2700|±  |0.0446|
+|mmlu_high_school_european_history                   |      0|none            |     0|acc        |↑  |0.2667|±  |0.0345|
+|mmlu_high_school_geography                          |      0|none            |     0|acc        |↑  |0.2020|±  |0.0286|
+|mmlu_high_school_government_and_politics            |      0|none            |     0|acc        |↑  |0.2332|±  |0.0305|
+|mmlu_high_school_macroeconomics                     |      0|none            |     0|acc        |↑  |0.2385|±  |0.0216|
+|mmlu_high_school_mathematics                        |      0|none            |     0|acc        |↑  |0.2556|±  |0.0266|
+|mmlu_high_school_microeconomics                     |      0|none            |     0|acc        |↑  |0.2353|±  |0.0276|
+|mmlu_high_school_physics                            |      0|none            |     0|acc        |↑  |0.2185|±  |0.0337|
+|mmlu_high_school_psychology                         |      0|none            |     0|acc        |↑  |0.2257|±  |0.0179|
+|mmlu_high_school_statistics                         |      0|none            |     0|acc        |↑  |0.1667|±  |0.0254|
+|mmlu_high_school_us_history                         |      0|none            |     0|acc        |↑  |0.2794|±  |0.0315|
+|mmlu_high_school_world_history                      |      0|none            |     0|acc        |↑  |0.2405|±  |0.0278|
+|mmlu_human_aging                                    |      0|none            |     0|acc        |↑  |0.3632|±  |0.0323|
+|mmlu_human_sexuality                                |      0|none            |     0|acc        |↑  |0.2443|±  |0.0377|
+|mmlu_humanities                                     |N/A    |none            |     0|acc        |↑  |0.2497|±  |0.0063|
+|mmlu_international_law                              |      0|none            |     0|acc        |↑  |0.2479|±  |0.0394|
+|mmlu_jurisprudence                                  |      0|none            |     0|acc        |↑  |0.3056|±  |0.0445|
+|mmlu_logical_fallacies                              |      0|none            |     0|acc        |↑  |0.2454|±  |0.0338|
+|mmlu_machine_learning                               |      0|none            |     0|acc        |↑  |0.2768|±  |0.0425|
+|mmlu_management                                     |      0|none            |     0|acc        |↑  |0.2621|±  |0.0435|
+|mmlu_marketing                                      |      0|none            |     0|acc        |↑  |0.2436|±  |0.0281|
+|mmlu_medical_genetics                               |      0|none            |     0|acc        |↑  |0.3300|±  |0.0473|
+|mmlu_miscellaneous                                  |      0|none            |     0|acc        |↑  |0.2452|±  |0.0154|
+|mmlu_moral_disputes                                 |      0|none            |     0|acc        |↑  |0.2572|±  |0.0235|
+|mmlu_moral_scenarios                                |      0|none            |     0|acc        |↑  |0.2391|±  |0.0143|
+|mmlu_nutrition                                      |      0|none            |     0|acc        |↑  |0.2092|±  |0.0233|
+|mmlu_other                                          |N/A    |none            |     0|acc        |↑  |0.2556|±  |0.0078|
+|mmlu_philosophy                                     |      0|none            |     0|acc        |↑  |0.2637|±  |0.0250|
+|mmlu_prehistory                                     |      0|none            |     0|acc        |↑  |0.2531|±  |0.0242|
+|mmlu_professional_accounting                        |      0|none            |     0|acc        |↑  |0.2766|±  |0.0267|
+|mmlu_professional_law                               |      0|none            |     0|acc        |↑  |0.2438|±  |0.0110|
+|mmlu_professional_medicine                          |      0|none            |     0|acc        |↑  |0.2022|±  |0.0244|
+|mmlu_professional_psychology                        |      0|none            |     0|acc        |↑  |0.2598|±  |0.0177|
+|mmlu_public_relations                               |      0|none            |     0|acc        |↑  |0.2818|±  |0.0431|
+|mmlu_security_studies                               |      0|none            |     0|acc        |↑  |0.1714|±  |0.0241|
+|mmlu_social_sciences                                |N/A    |none            |     0|acc        |↑  |0.2379|±  |0.0077|
+|mmlu_sociology                                      |      0|none            |     0|acc        |↑  |0.2836|±  |0.0319|
+|mmlu_stem                                           |N/A    |none            |     0|acc        |↑  |0.2474|±  |0.0077|
+|mmlu_us_foreign_policy                              |      0|none            |     0|acc        |↑  |0.2400|±  |0.0429|
+|mmlu_virology                                       |      0|none            |     0|acc        |↑  |0.3133|±  |0.0361|
+|mmlu_world_religions                                |      0|none            |     0|acc        |↑  |0.2515|±  |0.0333|
+|piqa                                                |      1|none            |     0|acc        |↑  |0.6279|±  |0.0113|
+|                                                    |       |none            |     0|acc_norm   |↑  |0.6289|±  |0.0113|
+|winogrande                                          |      1|none            |     0|acc        |↑  |0.4862|±  |0.0140|
+|       Groups       |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
+|--------------------|-------|------|-----:|--------|---|-----:|---|-----:|
+|ceval-valid         |N/A    |none  |     0|acc     |↑  |0.2623|±  |0.0120|
+|cmmlu               |N/A    |none  |     0|acc     |↑  |0.2475|±  |0.0040|
+|                    |       |none  |     0|acc_norm|↑  |0.2475|±  |0.0040|
+|mmlu                |N/A    |none  |     0|acc     |↑  |0.2479|±  |0.0036|
+|mmlu_humanities     |N/A    |none  |     0|acc     |↑  |0.2497|±  |0.0063|
+|mmlu_other          |N/A    |none  |     0|acc     |↑  |0.2556|±  |0.0078|
+|mmlu_social_sciences|N/A    |none  |     0|acc     |↑  |0.2379|±  |0.0077|
+|mmlu_stem           |N/A    |none  |     0|acc     |↑  |0.2474|±  |0.0077|

checkpoint-4500/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.45.2"
+}

checkpoint-4500/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step4500

checkpoint-4500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e57461eb6f9083c0af1438e2952e05fb9eb20f2b6ffc50cc0b827b25158f08dc
+size 748256328

checkpoint-4500/rng_state_0.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/rng_state_1.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/rng_state_2.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/rng_state_3.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/rng_state_4.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/rng_state_5.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/rng_state_6.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/rng_state_7.pth ADDED Viewed

Binary file (15.9 kB). View file

checkpoint-4500/scheduler.pt ADDED Viewed

Binary file (1.06 kB). View file

checkpoint-4500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-4500/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-4500/tokenizer.model ADDED Viewed

Binary file (493 kB). View file

checkpoint-4500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-4500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3215 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.6877706435717483,
+  "eval_steps": 1000,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005972823652381664,
+      "grad_norm": 0.5743309259414673,
+      "learning_rate": 1.5904572564612327e-06,
+      "loss": 2.7537,
+      "step": 10
+    },
+    {
+      "epoch": 0.011945647304763328,
+      "grad_norm": 0.5460094809532166,
+      "learning_rate": 3.1809145129224655e-06,
+      "loss": 2.7612,
+      "step": 20
+    },
+    {
+      "epoch": 0.01791847095714499,
+      "grad_norm": 0.5363145470619202,
+      "learning_rate": 4.7713717693836985e-06,
+      "loss": 2.7609,
+      "step": 30
+    },
+    {
+      "epoch": 0.023891294609526655,
+      "grad_norm": 0.5279455184936523,
+      "learning_rate": 6.361829025844931e-06,
+      "loss": 2.7607,
+      "step": 40
+    },
+    {
+      "epoch": 0.029864118261908316,
+      "grad_norm": 0.5061234831809998,
+      "learning_rate": 7.952286282306164e-06,
+      "loss": 2.784,
+      "step": 50
+    },
+    {
+      "epoch": 0.03583694191428998,
+      "grad_norm": 0.476898729801178,
+      "learning_rate": 9.542743538767397e-06,
+      "loss": 2.762,
+      "step": 60
+    },
+    {
+      "epoch": 0.041809765566671646,
+      "grad_norm": 0.4454072415828705,
+      "learning_rate": 1.113320079522863e-05,
+      "loss": 2.7716,
+      "step": 70
+    },
+    {
+      "epoch": 0.04778258921905331,
+      "grad_norm": 3.1541287899017334,
+      "learning_rate": 1.2723658051689862e-05,
+      "loss": 2.8849,
+      "step": 80
+    },
+    {
+      "epoch": 0.05375541287143497,
+      "grad_norm": 0.19107532501220703,
+      "learning_rate": 1.4314115308151095e-05,
+      "loss": 3.1147,
+      "step": 90
+    },
+    {
+      "epoch": 0.05972823652381663,
+      "grad_norm": 0.13281038403511047,
+      "learning_rate": 1.590457256461233e-05,
+      "loss": 2.5574,
+      "step": 100
+    },
+    {
+      "epoch": 0.0657010601761983,
+      "grad_norm": 0.08191326260566711,
+      "learning_rate": 1.749502982107356e-05,
+      "loss": 2.4446,
+      "step": 110
+    },
+    {
+      "epoch": 0.07167388382857996,
+      "grad_norm": 0.08300579339265823,
+      "learning_rate": 1.9085487077534794e-05,
+      "loss": 2.3524,
+      "step": 120
+    },
+    {
+      "epoch": 0.07764670748096163,
+      "grad_norm": 0.0590679906308651,
+      "learning_rate": 2.0675944333996028e-05,
+      "loss": 2.2819,
+      "step": 130
+    },
+    {
+      "epoch": 0.08361953113334329,
+      "grad_norm": 0.052923623472452164,
+      "learning_rate": 2.226640159045726e-05,
+      "loss": 2.2261,
+      "step": 140
+    },
+    {
+      "epoch": 0.08959235478572496,
+      "grad_norm": 0.05208205804228783,
+      "learning_rate": 2.385685884691849e-05,
+      "loss": 2.1889,
+      "step": 150
+    },
+    {
+      "epoch": 0.09556517843810662,
+      "grad_norm": 0.0485885925590992,
+      "learning_rate": 2.5447316103379724e-05,
+      "loss": 2.1694,
+      "step": 160
+    },
+    {
+      "epoch": 0.10153800209048827,
+      "grad_norm": 0.04901551082730293,
+      "learning_rate": 2.7037773359840955e-05,
+      "loss": 2.1272,
+      "step": 170
+    },
+    {
+      "epoch": 0.10751082574286994,
+      "grad_norm": 0.04524153470993042,
+      "learning_rate": 2.862823061630219e-05,
+      "loss": 2.1085,
+      "step": 180
+    },
+    {
+      "epoch": 0.1134836493952516,
+      "grad_norm": 0.04201298579573631,
+      "learning_rate": 3.021868787276342e-05,
+      "loss": 2.0902,
+      "step": 190
+    },
+    {
+      "epoch": 0.11945647304763327,
+      "grad_norm": 0.053612083196640015,
+      "learning_rate": 3.180914512922466e-05,
+      "loss": 2.0855,
+      "step": 200
+    },
+    {
+      "epoch": 0.12542929670001493,
+      "grad_norm": 0.04812688007950783,
+      "learning_rate": 3.3399602385685885e-05,
+      "loss": 2.0469,
+      "step": 210
+    },
+    {
+      "epoch": 0.1314021203523966,
+      "grad_norm": 0.0483262836933136,
+      "learning_rate": 3.499005964214712e-05,
+      "loss": 2.0264,
+      "step": 220
+    },
+    {
+      "epoch": 0.13737494400477826,
+      "grad_norm": 0.05456310138106346,
+      "learning_rate": 3.6580516898608353e-05,
+      "loss": 2.0201,
+      "step": 230
+    },
+    {
+      "epoch": 0.14334776765715992,
+      "grad_norm": 0.06978671252727509,
+      "learning_rate": 3.817097415506959e-05,
+      "loss": 1.9967,
+      "step": 240
+    },
+    {
+      "epoch": 0.1493205913095416,
+      "grad_norm": 0.049219317734241486,
+      "learning_rate": 3.976143141153082e-05,
+      "loss": 1.9909,
+      "step": 250
+    },
+    {
+      "epoch": 0.15529341496192325,
+      "grad_norm": 0.04814588651061058,
+      "learning_rate": 4.1351888667992056e-05,
+      "loss": 1.9793,
+      "step": 260
+    },
+    {
+      "epoch": 0.16126623861430492,
+      "grad_norm": 0.06128086522221565,
+      "learning_rate": 4.2942345924453284e-05,
+      "loss": 1.9703,
+      "step": 270
+    },
+    {
+      "epoch": 0.16723906226668658,
+      "grad_norm": 0.06803273409605026,
+      "learning_rate": 4.453280318091452e-05,
+      "loss": 1.9484,
+      "step": 280
+    },
+    {
+      "epoch": 0.17321188591906825,
+      "grad_norm": 0.06598497182130814,
+      "learning_rate": 4.612326043737575e-05,
+      "loss": 1.9251,
+      "step": 290
+    },
+    {
+      "epoch": 0.1791847095714499,
+      "grad_norm": 0.05581754818558693,
+      "learning_rate": 4.771371769383698e-05,
+      "loss": 1.9211,
+      "step": 300
+    },
+    {
+      "epoch": 0.18515753322383158,
+      "grad_norm": 0.06264442205429077,
+      "learning_rate": 4.9304174950298214e-05,
+      "loss": 1.9047,
+      "step": 310
+    },
+    {
+      "epoch": 0.19113035687621324,
+      "grad_norm": 0.05809122323989868,
+      "learning_rate": 5.089463220675945e-05,
+      "loss": 1.8948,
+      "step": 320
+    },
+    {
+      "epoch": 0.1971031805285949,
+      "grad_norm": 0.05478562042117119,
+      "learning_rate": 5.248508946322068e-05,
+      "loss": 1.8924,
+      "step": 330
+    },
+    {
+      "epoch": 0.20307600418097654,
+      "grad_norm": 0.060149796307086945,
+      "learning_rate": 5.407554671968191e-05,
+      "loss": 1.8776,
+      "step": 340
+    },
+    {
+      "epoch": 0.2090488278333582,
+      "grad_norm": 0.06282585859298706,
+      "learning_rate": 5.5666003976143144e-05,
+      "loss": 1.8752,
+      "step": 350
+    },
+    {
+      "epoch": 0.21502165148573987,
+      "grad_norm": 0.06441989541053772,
+      "learning_rate": 5.725646123260438e-05,
+      "loss": 1.8632,
+      "step": 360
+    },
+    {
+      "epoch": 0.22099447513812154,
+      "grad_norm": 0.05681062117218971,
+      "learning_rate": 5.8846918489065606e-05,
+      "loss": 1.8475,
+      "step": 370
+    },
+    {
+      "epoch": 0.2269672987905032,
+      "grad_norm": 0.05155131593346596,
+      "learning_rate": 6.043737574552684e-05,
+      "loss": 1.8431,
+      "step": 380
+    },
+    {
+      "epoch": 0.23294012244288487,
+      "grad_norm": 0.05347074940800667,
+      "learning_rate": 6.202783300198807e-05,
+      "loss": 1.8416,
+      "step": 390
+    },
+    {
+      "epoch": 0.23891294609526653,
+      "grad_norm": 0.06694310158491135,
+      "learning_rate": 6.361829025844931e-05,
+      "loss": 1.8344,
+      "step": 400
+    },
+    {
+      "epoch": 0.2448857697476482,
+      "grad_norm": 0.06079185754060745,
+      "learning_rate": 6.520874751491054e-05,
+      "loss": 1.8297,
+      "step": 410
+    },
+    {
+      "epoch": 0.25085859340002986,
+      "grad_norm": 0.05415233224630356,
+      "learning_rate": 6.679920477137177e-05,
+      "loss": 1.82,
+      "step": 420
+    },
+    {
+      "epoch": 0.2568314170524115,
+      "grad_norm": 0.0645110234618187,
+      "learning_rate": 6.838966202783301e-05,
+      "loss": 1.8137,
+      "step": 430
+    },
+    {
+      "epoch": 0.2628042407047932,
+      "grad_norm": 0.06045007333159447,
+      "learning_rate": 6.998011928429424e-05,
+      "loss": 1.8048,
+      "step": 440
+    },
+    {
+      "epoch": 0.26877706435717486,
+      "grad_norm": 0.05600131303071976,
+      "learning_rate": 7.157057654075547e-05,
+      "loss": 1.7854,
+      "step": 450
+    },
+    {
+      "epoch": 0.2747498880095565,
+      "grad_norm": 0.06498062610626221,
+      "learning_rate": 7.316103379721671e-05,
+      "loss": 1.798,
+      "step": 460
+    },
+    {
+      "epoch": 0.2807227116619382,
+      "grad_norm": 0.053577929735183716,
+      "learning_rate": 7.475149105367795e-05,
+      "loss": 1.7883,
+      "step": 470
+    },
+    {
+      "epoch": 0.28669553531431985,
+      "grad_norm": 0.09097382426261902,
+      "learning_rate": 7.634194831013918e-05,
+      "loss": 1.78,
+      "step": 480
+    },
+    {
+      "epoch": 0.2926683589667015,
+      "grad_norm": 0.057212598621845245,
+      "learning_rate": 7.79324055666004e-05,
+      "loss": 1.7705,
+      "step": 490
+    },
+    {
+      "epoch": 0.2986411826190832,
+      "grad_norm": 0.055311623960733414,
+      "learning_rate": 7.952286282306164e-05,
+      "loss": 1.7739,
+      "step": 500
+    },
+    {
+      "epoch": 0.30461400627146484,
+      "grad_norm": 0.07679615169763565,
+      "learning_rate": 7.999952636882403e-05,
+      "loss": 1.7705,
+      "step": 510
+    },
+    {
+      "epoch": 0.3105868299238465,
+      "grad_norm": 0.10281822085380554,
+      "learning_rate": 7.999720656965739e-05,
+      "loss": 1.7639,
+      "step": 520
+    },
+    {
+      "epoch": 0.3165596535762282,
+      "grad_norm": 0.07636060565710068,
+      "learning_rate": 7.999295372099362e-05,
+      "loss": 1.7539,
+      "step": 530
+    },
+    {
+      "epoch": 0.32253247722860984,
+      "grad_norm": 0.057714689522981644,
+      "learning_rate": 7.998676802837124e-05,
+      "loss": 1.7541,
+      "step": 540
+    },
+    {
+      "epoch": 0.3285053008809915,
+      "grad_norm": 0.06505981832742691,
+      "learning_rate": 7.997864979074237e-05,
+      "loss": 1.7487,
+      "step": 550
+    },
+    {
+      "epoch": 0.33447812453337317,
+      "grad_norm": 0.05842842161655426,
+      "learning_rate": 7.996859940045832e-05,
+      "loss": 1.739,
+      "step": 560
+    },
+    {
+      "epoch": 0.34045094818575483,
+      "grad_norm": 0.051559966057538986,
+      "learning_rate": 7.995661734325054e-05,
+      "loss": 1.7443,
+      "step": 570
+    },
+    {
+      "epoch": 0.3464237718381365,
+      "grad_norm": 0.20853149890899658,
+      "learning_rate": 7.994270419820721e-05,
+      "loss": 1.7719,
+      "step": 580
+    },
+    {
+      "epoch": 0.35239659549051816,
+      "grad_norm": 0.09151974320411682,
+      "learning_rate": 7.992686063774525e-05,
+      "loss": 1.7817,
+      "step": 590
+    },
+    {
+      "epoch": 0.3583694191428998,
+      "grad_norm": 0.05926055088639259,
+      "learning_rate": 7.99090874275778e-05,
+      "loss": 1.7469,
+      "step": 600
+    },
+    {
+      "epoch": 0.3643422427952815,
+      "grad_norm": 0.044228848069906235,
+      "learning_rate": 7.988938542667721e-05,
+      "loss": 1.7393,
+      "step": 610
+    },
+    {
+      "epoch": 0.37031506644766315,
+      "grad_norm": 0.0427553653717041,
+      "learning_rate": 7.986775558723355e-05,
+      "loss": 1.7307,
+      "step": 620
+    },
+    {
+      "epoch": 0.3762878901000448,
+      "grad_norm": 0.0548509880900383,
+      "learning_rate": 7.984419895460858e-05,
+      "loss": 1.7205,
+      "step": 630
+    },
+    {
+      "epoch": 0.3822607137524265,
+      "grad_norm": 0.057041749358177185,
+      "learning_rate": 7.981871666728525e-05,
+      "loss": 1.7225,
+      "step": 640
+    },
+    {
+      "epoch": 0.38823353740480815,
+      "grad_norm": 0.056601762771606445,
+      "learning_rate": 7.979130995681263e-05,
+      "loss": 1.7088,
+      "step": 650
+    },
+    {
+      "epoch": 0.3942063610571898,
+      "grad_norm": 0.06844093650579453,
+      "learning_rate": 7.976198014774637e-05,
+      "loss": 1.7073,
+      "step": 660
+    },
+    {
+      "epoch": 0.4001791847095714,
+      "grad_norm": 0.0546780526638031,
+      "learning_rate": 7.973072865758483e-05,
+      "loss": 1.7121,
+      "step": 670
+    },
+    {
+      "epoch": 0.4061520083619531,
+      "grad_norm": 0.04654558375477791,
+      "learning_rate": 7.969755699670041e-05,
+      "loss": 1.6951,
+      "step": 680
+    },
+    {
+      "epoch": 0.41212483201433475,
+      "grad_norm": 0.06478898227214813,
+      "learning_rate": 7.966246676826661e-05,
+      "loss": 1.7055,
+      "step": 690
+    },
+    {
+      "epoch": 0.4180976556667164,
+      "grad_norm": 0.06878198683261871,
+      "learning_rate": 7.962545966818062e-05,
+      "loss": 1.6987,
+      "step": 700
+    },
+    {
+      "epoch": 0.4240704793190981,
+      "grad_norm": 0.05675249919295311,
+      "learning_rate": 7.95865374849812e-05,
+      "loss": 1.6998,
+      "step": 710
+    },
+    {
+      "epoch": 0.43004330297147975,
+      "grad_norm": 0.05516457185149193,
+      "learning_rate": 7.954570209976239e-05,
+      "loss": 1.6852,
+      "step": 720
+    },
+    {
+      "epoch": 0.4360161266238614,
+      "grad_norm": 0.05688585340976715,
+      "learning_rate": 7.950295548608256e-05,
+      "loss": 1.6901,
+      "step": 730
+    },
+    {
+      "epoch": 0.4419889502762431,
+      "grad_norm": 0.07187242805957794,
+      "learning_rate": 7.945829970986898e-05,
+      "loss": 1.6894,
+      "step": 740
+    },
+    {
+      "epoch": 0.44796177392862474,
+      "grad_norm": 0.0548662506043911,
+      "learning_rate": 7.941173692931801e-05,
+      "loss": 1.6819,
+      "step": 750
+    },
+    {
+      "epoch": 0.4539345975810064,
+      "grad_norm": 0.0926741436123848,
+      "learning_rate": 7.93632693947908e-05,
+      "loss": 1.6797,
+      "step": 760
+    },
+    {
+      "epoch": 0.45990742123338807,
+      "grad_norm": 0.04921697825193405,
+      "learning_rate": 7.931289944870448e-05,
+      "loss": 1.6629,
+      "step": 770
+    },
+    {
+      "epoch": 0.46588024488576973,
+      "grad_norm": 0.07487112283706665,
+      "learning_rate": 7.92606295254191e-05,
+      "loss": 1.6737,
+      "step": 780
+    },
+    {
+      "epoch": 0.4718530685381514,
+      "grad_norm": 0.07180643826723099,
+      "learning_rate": 7.920646215111973e-05,
+      "loss": 1.6716,
+      "step": 790
+    },
+    {
+      "epoch": 0.47782589219053306,
+      "grad_norm": 0.050522662699222565,
+      "learning_rate": 7.915039994369462e-05,
+      "loss": 1.6597,
+      "step": 800
+    },
+    {
+      "epoch": 0.48379871584291473,
+      "grad_norm": 0.0628654807806015,
+      "learning_rate": 7.909244561260855e-05,
+      "loss": 1.6722,
+      "step": 810
+    },
+    {
+      "epoch": 0.4897715394952964,
+      "grad_norm": 0.07348821312189102,
+      "learning_rate": 7.903260195877184e-05,
+      "loss": 1.6718,
+      "step": 820
+    },
+    {
+      "epoch": 0.49574436314767806,
+      "grad_norm": 0.0689951702952385,
+      "learning_rate": 7.897087187440512e-05,
+      "loss": 1.6658,
+      "step": 830
+    },
+    {
+      "epoch": 0.5017171868000597,
+      "grad_norm": 0.05663711205124855,
+      "learning_rate": 7.890725834289946e-05,
+      "loss": 1.6636,
+      "step": 840
+    },
+    {
+      "epoch": 0.5076900104524414,
+      "grad_norm": 0.050597622990608215,
+      "learning_rate": 7.884176443867219e-05,
+      "loss": 1.6648,
+      "step": 850
+    },
+    {
+      "epoch": 0.513662834104823,
+      "grad_norm": 0.05792626738548279,
+      "learning_rate": 7.87743933270183e-05,
+      "loss": 1.6582,
+      "step": 860
+    },
+    {
+      "epoch": 0.5196356577572048,
+      "grad_norm": 0.05193015933036804,
+      "learning_rate": 7.870514826395755e-05,
+      "loss": 1.664,
+      "step": 870
+    },
+    {
+      "epoch": 0.5256084814095864,
+      "grad_norm": 0.05836218595504761,
+      "learning_rate": 7.863403259607698e-05,
+      "loss": 1.6535,
+      "step": 880
+    },
+    {
+      "epoch": 0.531581305061968,
+      "grad_norm": 0.08420410752296448,
+      "learning_rate": 7.856104976036928e-05,
+      "loss": 1.6463,
+      "step": 890
+    },
+    {
+      "epoch": 0.5375541287143497,
+      "grad_norm": 0.06460799276828766,
+      "learning_rate": 7.848620328406663e-05,
+      "loss": 1.6615,
+      "step": 900
+    },
+    {
+      "epoch": 0.5435269523667313,
+      "grad_norm": 0.08191855251789093,
+      "learning_rate": 7.840949678447022e-05,
+      "loss": 1.6529,
+      "step": 910
+    },
+    {
+      "epoch": 0.549499776019113,
+      "grad_norm": 0.04835124313831329,
+      "learning_rate": 7.833093396877546e-05,
+      "loss": 1.6508,
+      "step": 920
+    },
+    {
+      "epoch": 0.5554725996714946,
+      "grad_norm": 0.047752317041158676,
+      "learning_rate": 7.82505186338928e-05,
+      "loss": 1.6484,
+      "step": 930
+    },
+    {
+      "epoch": 0.5614454233238764,
+      "grad_norm": 0.054417744278907776,
+      "learning_rate": 7.816825466626419e-05,
+      "loss": 1.6443,
+      "step": 940
+    },
+    {
+      "epoch": 0.567418246976258,
+      "grad_norm": 0.0538078136742115,
+      "learning_rate": 7.808414604167537e-05,
+      "loss": 1.6422,
+      "step": 950
+    },
+    {
+      "epoch": 0.5733910706286397,
+      "grad_norm": 0.04438367858529091,
+      "learning_rate": 7.799819682506353e-05,
+      "loss": 1.6443,
+      "step": 960
+    },
+    {
+      "epoch": 0.5793638942810213,
+      "grad_norm": 0.056033167988061905,
+      "learning_rate": 7.791041117032102e-05,
+      "loss": 1.6428,
+      "step": 970
+    },
+    {
+      "epoch": 0.585336717933403,
+      "grad_norm": 0.07095460593700409,
+      "learning_rate": 7.782079332009454e-05,
+      "loss": 1.6425,
+      "step": 980
+    },
+    {
+      "epoch": 0.5913095415857846,
+      "grad_norm": 0.05874691903591156,
+      "learning_rate": 7.772934760558005e-05,
+      "loss": 1.6346,
+      "step": 990
+    },
+    {
+      "epoch": 0.5972823652381664,
+      "grad_norm": 0.0521966814994812,
+      "learning_rate": 7.76360784463135e-05,
+      "loss": 1.6359,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5972823652381664,
+      "eval_loss": 1.634853482246399,
+      "eval_runtime": 28.9256,
+      "eval_samples_per_second": 1197.311,
+      "eval_steps_per_second": 9.369,
+      "step": 1000
+    },
+    {
+      "epoch": 0.603255188890548,
+      "grad_norm": 0.052664998918771744,
+      "learning_rate": 7.754099034995727e-05,
+      "loss": 1.6383,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6092280125429297,
+      "grad_norm": 0.08000710606575012,
+      "learning_rate": 7.744408791208214e-05,
+      "loss": 1.639,
+      "step": 1020
+    },
+    {
+      "epoch": 0.6152008361953113,
+      "grad_norm": 0.05873206630349159,
+      "learning_rate": 7.734537581594545e-05,
+      "loss": 1.632,
+      "step": 1030
+    },
+    {
+      "epoch": 0.621173659847693,
+      "grad_norm": 0.06116827204823494,
+      "learning_rate": 7.724485883226454e-05,
+      "loss": 1.6351,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6271464835000746,
+      "grad_norm": 0.057659681886434555,
+      "learning_rate": 7.714254181898627e-05,
+      "loss": 1.637,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6331193071524563,
+      "grad_norm": 0.05905848369002342,
+      "learning_rate": 7.703842972105228e-05,
+      "loss": 1.626,
+      "step": 1060
+    },
+    {
+      "epoch": 0.639092130804838,
+      "grad_norm": 0.0539986751973629,
+      "learning_rate": 7.693252757015991e-05,
+      "loss": 1.6278,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6450649544572197,
+      "grad_norm": 0.062365371733903885,
+      "learning_rate": 7.682484048451908e-05,
+      "loss": 1.6187,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6510377781096013,
+      "grad_norm": 0.0486634224653244,
+      "learning_rate": 7.671537366860494e-05,
+      "loss": 1.6223,
+      "step": 1090
+    },
+    {
+      "epoch": 0.657010601761983,
+      "grad_norm": 0.04700983688235283,
+      "learning_rate": 7.660413241290626e-05,
+      "loss": 1.6237,
+      "step": 1100
+    },
+    {
+      "epoch": 0.6629834254143646,
+      "grad_norm": 0.06423746794462204,
+      "learning_rate": 7.649112209366985e-05,
+      "loss": 1.6349,
+      "step": 1110
+    },
+    {
+      "epoch": 0.6689562490667463,
+      "grad_norm": 0.05183717608451843,
+      "learning_rate": 7.637634817264064e-05,
+      "loss": 1.6203,
+      "step": 1120
+    },
+    {
+      "epoch": 0.6749290727191279,
+      "grad_norm": 0.05448286980390549,
+      "learning_rate": 7.625981619679777e-05,
+      "loss": 1.6159,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6809018963715097,
+      "grad_norm": 0.06012860685586929,
+      "learning_rate": 7.61415317980865e-05,
+      "loss": 1.6106,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6868747200238913,
+      "grad_norm": 0.0491897277534008,
+      "learning_rate": 7.602150069314598e-05,
+      "loss": 1.613,
+      "step": 1150
+    },
+    {
+      "epoch": 0.692847543676273,
+      "grad_norm": 0.05050448700785637,
+      "learning_rate": 7.589972868303301e-05,
+      "loss": 1.6158,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6988203673286546,
+      "grad_norm": 0.05027921870350838,
+      "learning_rate": 7.577622165294165e-05,
+      "loss": 1.6166,
+      "step": 1170
+    },
+    {
+      "epoch": 0.7047931909810363,
+      "grad_norm": 0.061239466071128845,
+      "learning_rate": 7.565098557191882e-05,
+      "loss": 1.607,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7107660146334179,
+      "grad_norm": 0.04995877295732498,
+      "learning_rate": 7.552402649257578e-05,
+      "loss": 1.6152,
+      "step": 1190
+    },
+    {
+      "epoch": 0.7167388382857997,
+      "grad_norm": 0.04830503091216087,
+      "learning_rate": 7.539535055079569e-05,
+      "loss": 1.613,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7227116619381813,
+      "grad_norm": 0.05787483602762222,
+      "learning_rate": 7.526496396543691e-05,
+      "loss": 1.614,
+      "step": 1210
+    },
+    {
+      "epoch": 0.728684485590563,
+      "grad_norm": 0.07437578588724136,
+      "learning_rate": 7.513287303803263e-05,
+      "loss": 1.6127,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7346573092429446,
+      "grad_norm": 0.06587845832109451,
+      "learning_rate": 7.499908415248616e-05,
+      "loss": 1.6015,
+      "step": 1230
+    },
+    {
+      "epoch": 0.7406301328953263,
+      "grad_norm": 0.0692521184682846,
+      "learning_rate": 7.486360377476255e-05,
+      "loss": 1.6026,
+      "step": 1240
+    },
+    {
+      "epoch": 0.7466029565477079,
+      "grad_norm": 0.061289019882678986,
+      "learning_rate": 7.472643845257592e-05,
+      "loss": 1.6108,
+      "step": 1250
+    },
+    {
+      "epoch": 0.7525757802000896,
+      "grad_norm": 0.056076616048812866,
+      "learning_rate": 7.458759481507318e-05,
+      "loss": 1.6018,
+      "step": 1260
+    },
+    {
+      "epoch": 0.7585486038524712,
+      "grad_norm": 0.06620051711797714,
+      "learning_rate": 7.444707957251354e-05,
+      "loss": 1.6048,
+      "step": 1270
+    },
+    {
+      "epoch": 0.764521427504853,
+      "grad_norm": 0.05557152256369591,
+      "learning_rate": 7.430489951594422e-05,
+      "loss": 1.6091,
+      "step": 1280
+    },
+    {
+      "epoch": 0.7704942511572346,
+      "grad_norm": 0.04953812435269356,
+      "learning_rate": 7.416106151687224e-05,
+      "loss": 1.6026,
+      "step": 1290
+    },
+    {
+      "epoch": 0.7764670748096163,
+      "grad_norm": 0.042427971959114075,
+      "learning_rate": 7.40155725269324e-05,
+      "loss": 1.5983,
+      "step": 1300
+    },
+    {
+      "epoch": 0.7824398984619979,
+      "grad_norm": 0.05906856432557106,
+      "learning_rate": 7.386843957755123e-05,
+      "loss": 1.6008,
+      "step": 1310
+    },
+    {
+      "epoch": 0.7884127221143796,
+      "grad_norm": 0.04983474314212799,
+      "learning_rate": 7.371966977960713e-05,
+      "loss": 1.5973,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7943855457667612,
+      "grad_norm": 0.0590224526822567,
+      "learning_rate": 7.356927032308682e-05,
+      "loss": 1.6011,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8003583694191428,
+      "grad_norm": 0.057693641632795334,
+      "learning_rate": 7.341724847673775e-05,
+      "loss": 1.5942,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8063311930715246,
+      "grad_norm": 0.040723856538534164,
+      "learning_rate": 7.326361158771688e-05,
+      "loss": 1.6011,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8123040167239062,
+      "grad_norm": 0.05768086016178131,
+      "learning_rate": 7.31083670812355e-05,
+      "loss": 1.5999,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8182768403762879,
+      "grad_norm": 0.06345749646425247,
+      "learning_rate": 7.29515224602005e-05,
+      "loss": 1.5985,
+      "step": 1370
+    },
+    {
+      "epoch": 0.8242496640286695,
+      "grad_norm": 0.06176001578569412,
+      "learning_rate": 7.27930853048516e-05,
+      "loss": 1.5971,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8302224876810512,
+      "grad_norm": 0.05247745290398598,
+      "learning_rate": 7.263306327239516e-05,
+      "loss": 1.5958,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8361953113334328,
+      "grad_norm": 0.05218351632356644,
+      "learning_rate": 7.247146409663401e-05,
+      "loss": 1.5981,
+      "step": 1400
+    },
+    {
+      "epoch": 0.8421681349858146,
+      "grad_norm": 0.0629679337143898,
+      "learning_rate": 7.23082955875937e-05,
+      "loss": 1.5949,
+      "step": 1410
+    },
+    {
+      "epoch": 0.8481409586381962,
+      "grad_norm": 0.061205677688121796,
+      "learning_rate": 7.214356563114505e-05,
+      "loss": 1.5957,
+      "step": 1420
+    },
+    {
+      "epoch": 0.8541137822905779,
+      "grad_norm": 0.06122026965022087,
+      "learning_rate": 7.197728218862306e-05,
+      "loss": 1.5911,
+      "step": 1430
+    },
+    {
+      "epoch": 0.8600866059429595,
+      "grad_norm": 0.054293327033519745,
+      "learning_rate": 7.180945329644204e-05,
+      "loss": 1.5885,
+      "step": 1440
+    },
+    {
+      "epoch": 0.8660594295953412,
+      "grad_norm": 0.04569542035460472,
+      "learning_rate": 7.164008706570736e-05,
+      "loss": 1.5893,
+      "step": 1450
+    },
+    {
+      "epoch": 0.8720322532477228,
+      "grad_norm": 0.04415179416537285,
+      "learning_rate": 7.146919168182333e-05,
+      "loss": 1.5951,
+      "step": 1460
+    },
+    {
+      "epoch": 0.8780050769001045,
+      "grad_norm": 0.052418701350688934,
+      "learning_rate": 7.129677540409762e-05,
+      "loss": 1.5999,
+      "step": 1470
+    },
+    {
+      "epoch": 0.8839779005524862,
+      "grad_norm": 0.053583066910505295,
+      "learning_rate": 7.112284656534215e-05,
+      "loss": 1.5979,
+      "step": 1480
+    },
+    {
+      "epoch": 0.8899507242048679,
+      "grad_norm": 0.06733547151088715,
+      "learning_rate": 7.09474135714703e-05,
+      "loss": 1.5871,
+      "step": 1490
+    },
+    {
+      "epoch": 0.8959235478572495,
+      "grad_norm": 0.05455510690808296,
+      "learning_rate": 7.07704849010907e-05,
+      "loss": 1.5912,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9018963715096312,
+      "grad_norm": 0.05950945243239403,
+      "learning_rate": 7.059206910509745e-05,
+      "loss": 1.5958,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9078691951620128,
+      "grad_norm": 0.0513860359787941,
+      "learning_rate": 7.041217480625683e-05,
+      "loss": 1.5856,
+      "step": 1520
+    },
+    {
+      "epoch": 0.9138420188143945,
+      "grad_norm": 0.05268612131476402,
+      "learning_rate": 7.023081069879062e-05,
+      "loss": 1.5846,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9198148424667761,
+      "grad_norm": 0.05923028290271759,
+      "learning_rate": 7.004798554795586e-05,
+      "loss": 1.5739,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9257876661191579,
+      "grad_norm": 0.04859180748462677,
+      "learning_rate": 6.986370818962125e-05,
+      "loss": 1.5927,
+      "step": 1550
+    },
+    {
+      "epoch": 0.9317604897715395,
+      "grad_norm": 0.060852836817502975,
+      "learning_rate": 6.967798752984012e-05,
+      "loss": 1.5769,
+      "step": 1560
+    },
+    {
+      "epoch": 0.9377333134239212,
+      "grad_norm": 0.053088609129190445,
+      "learning_rate": 6.949083254442001e-05,
+      "loss": 1.5845,
+      "step": 1570
+    },
+    {
+      "epoch": 0.9437061370763028,
+      "grad_norm": 0.06042907387018204,
+      "learning_rate": 6.930225227848887e-05,
+      "loss": 1.5808,
+      "step": 1580
+    },
+    {
+      "epoch": 0.9496789607286845,
+      "grad_norm": 0.05746331810951233,
+      "learning_rate": 6.911225584605787e-05,
+      "loss": 1.5821,
+      "step": 1590
+    },
+    {
+      "epoch": 0.9556517843810661,
+      "grad_norm": 0.04398033022880554,
+      "learning_rate": 6.892085242958098e-05,
+      "loss": 1.5775,
+      "step": 1600
+    },
+    {
+      "epoch": 0.9616246080334478,
+      "grad_norm": 0.050728365778923035,
+      "learning_rate": 6.872805127951115e-05,
+      "loss": 1.5749,
+      "step": 1610
+    },
+    {
+      "epoch": 0.9675974316858295,
+      "grad_norm": 0.0519120879471302,
+      "learning_rate": 6.85338617138533e-05,
+      "loss": 1.5726,
+      "step": 1620
+    },
+    {
+      "epoch": 0.9735702553382112,
+      "grad_norm": 0.052526745945215225,
+      "learning_rate": 6.833829311771388e-05,
+      "loss": 1.5793,
+      "step": 1630
+    },
+    {
+      "epoch": 0.9795430789905928,
+      "grad_norm": 0.050527602434158325,
+      "learning_rate": 6.814135494284735e-05,
+      "loss": 1.5694,
+      "step": 1640
+    },
+    {
+      "epoch": 0.9855159026429745,
+      "grad_norm": 0.08685663342475891,
+      "learning_rate": 6.794305670719945e-05,
+      "loss": 1.5803,
+      "step": 1650
+    },
+    {
+      "epoch": 0.9914887262953561,
+      "grad_norm": 0.054428499191999435,
+      "learning_rate": 6.774340799444703e-05,
+      "loss": 1.5757,
+      "step": 1660
+    },
+    {
+      "epoch": 0.9974615499477378,
+      "grad_norm": 0.05870772898197174,
+      "learning_rate": 6.754241845353506e-05,
+      "loss": 1.571,
+      "step": 1670
+    },
+    {
+      "epoch": 1.0034343736001194,
+      "grad_norm": 0.05581633001565933,
+      "learning_rate": 6.734009779821018e-05,
+      "loss": 1.5659,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0094071972525012,
+      "grad_norm": 0.05493481829762459,
+      "learning_rate": 6.713645580655125e-05,
+      "loss": 1.5686,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0153800209048829,
+      "grad_norm": 0.05471092462539673,
+      "learning_rate": 6.693150232049686e-05,
+      "loss": 1.5649,
+      "step": 1700
+    },
+    {
+      "epoch": 1.0213528445572644,
+      "grad_norm": 0.053526680916547775,
+      "learning_rate": 6.672524724536956e-05,
+      "loss": 1.5671,
+      "step": 1710
+    },
+    {
+      "epoch": 1.027325668209646,
+      "grad_norm": 0.06532900780439377,
+      "learning_rate": 6.651770054939722e-05,
+      "loss": 1.5614,
+      "step": 1720
+    },
+    {
+      "epoch": 1.0332984918620278,
+      "grad_norm": 0.051929574459791183,
+      "learning_rate": 6.630887226323128e-05,
+      "loss": 1.556,
+      "step": 1730
+    },
+    {
+      "epoch": 1.0392713155144095,
+      "grad_norm": 0.06289497762918472,
+      "learning_rate": 6.609877247946186e-05,
+      "loss": 1.5634,
+      "step": 1740
+    },
+    {
+      "epoch": 1.045244139166791,
+      "grad_norm": 0.05371445044875145,
+      "learning_rate": 6.588741135213012e-05,
+      "loss": 1.5645,
+      "step": 1750
+    },
+    {
+      "epoch": 1.0512169628191728,
+      "grad_norm": 0.04851632937788963,
+      "learning_rate": 6.567479909623746e-05,
+      "loss": 1.5648,
+      "step": 1760
+    },
+    {
+      "epoch": 1.0571897864715545,
+      "grad_norm": 0.06357111036777496,
+      "learning_rate": 6.546094598725186e-05,
+      "loss": 1.5568,
+      "step": 1770
+    },
+    {
+      "epoch": 1.063162610123936,
+      "grad_norm": 0.07035905867815018,
+      "learning_rate": 6.524586236061117e-05,
+      "loss": 1.5519,
+      "step": 1780
+    },
+    {
+      "epoch": 1.0691354337763177,
+      "grad_norm": 0.05517163127660751,
+      "learning_rate": 6.502955861122377e-05,
+      "loss": 1.5566,
+      "step": 1790
+    },
+    {
+      "epoch": 1.0751082574286994,
+      "grad_norm": 0.0504322424530983,
+      "learning_rate": 6.481204519296606e-05,
+      "loss": 1.5668,
+      "step": 1800
+    },
+    {
+      "epoch": 1.0810810810810811,
+      "grad_norm": 0.051910221576690674,
+      "learning_rate": 6.459333261817726e-05,
+      "loss": 1.5585,
+      "step": 1810
+    },
+    {
+      "epoch": 1.0870539047334629,
+      "grad_norm": 0.07319536805152893,
+      "learning_rate": 6.43734314571514e-05,
+      "loss": 1.5599,
+      "step": 1820
+    },
+    {
+      "epoch": 1.0930267283858444,
+      "grad_norm": 0.05212223529815674,
+      "learning_rate": 6.415235233762635e-05,
+      "loss": 1.5597,
+      "step": 1830
+    },
+    {
+      "epoch": 1.098999552038226,
+      "grad_norm": 0.05524059012532234,
+      "learning_rate": 6.393010594427034e-05,
+      "loss": 1.5449,
+      "step": 1840
+    },
+    {
+      "epoch": 1.1049723756906078,
+      "grad_norm": 0.044485364109277725,
+      "learning_rate": 6.370670301816544e-05,
+      "loss": 1.5584,
+      "step": 1850
+    },
+    {
+      "epoch": 1.1109451993429893,
+      "grad_norm": 0.04716966673731804,
+      "learning_rate": 6.348215435628852e-05,
+      "loss": 1.5577,
+      "step": 1860
+    },
+    {
+      "epoch": 1.116918022995371,
+      "grad_norm": 0.04776601493358612,
+      "learning_rate": 6.32564708109894e-05,
+      "loss": 1.5597,
+      "step": 1870
+    },
+    {
+      "epoch": 1.1228908466477527,
+      "grad_norm": 0.05379948392510414,
+      "learning_rate": 6.302966328946638e-05,
+      "loss": 1.5542,
+      "step": 1880
+    },
+    {
+      "epoch": 1.1288636703001345,
+      "grad_norm": 0.05076327919960022,
+      "learning_rate": 6.280174275323915e-05,
+      "loss": 1.5564,
+      "step": 1890
+    },
+    {
+      "epoch": 1.134836493952516,
+      "grad_norm": 0.0562434047460556,
+      "learning_rate": 6.257272021761884e-05,
+      "loss": 1.5597,
+      "step": 1900
+    },
+    {
+      "epoch": 1.1408093176048977,
+      "grad_norm": 0.045845337212085724,
+      "learning_rate": 6.234260675117595e-05,
+      "loss": 1.5535,
+      "step": 1910
+    },
+    {
+      "epoch": 1.1467821412572794,
+      "grad_norm": 0.04580407217144966,
+      "learning_rate": 6.21114134752051e-05,
+      "loss": 1.5486,
+      "step": 1920
+    },
+    {
+      "epoch": 1.1527549649096611,
+      "grad_norm": 0.05752042680978775,
+      "learning_rate": 6.187915156318775e-05,
+      "loss": 1.5454,
+      "step": 1930
+    },
+    {
+      "epoch": 1.1587277885620426,
+      "grad_norm": 0.05608632043004036,
+      "learning_rate": 6.164583224025215e-05,
+      "loss": 1.5545,
+      "step": 1940
+    },
+    {
+      "epoch": 1.1647006122144243,
+      "grad_norm": 0.047604430466890335,
+      "learning_rate": 6.141146678263076e-05,
+      "loss": 1.5531,
+      "step": 1950
+    },
+    {
+      "epoch": 1.170673435866806,
+      "grad_norm": 0.04514037445187569,
+      "learning_rate": 6.117606651711537e-05,
+      "loss": 1.5547,
+      "step": 1960
+    },
+    {
+      "epoch": 1.1766462595191878,
+      "grad_norm": 0.05768571048974991,
+      "learning_rate": 6.0939642820509564e-05,
+      "loss": 1.5496,
+      "step": 1970
+    },
+    {
+      "epoch": 1.1826190831715693,
+      "grad_norm": 0.04222779721021652,
+      "learning_rate": 6.070220711907903e-05,
+      "loss": 1.5469,
+      "step": 1980
+    },
+    {
+      "epoch": 1.188591906823951,
+      "grad_norm": 0.05183190852403641,
+      "learning_rate": 6.046377088799923e-05,
+      "loss": 1.5526,
+      "step": 1990
+    },
+    {
+      "epoch": 1.1945647304763327,
+      "grad_norm": 0.04888539016246796,
+      "learning_rate": 6.0224345650800826e-05,
+      "loss": 1.5579,
+      "step": 2000
+    },
+    {
+      "epoch": 1.1945647304763327,
+      "eval_loss": 1.5546131134033203,
+      "eval_runtime": 20.1679,
+      "eval_samples_per_second": 1717.237,
+      "eval_steps_per_second": 13.437,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2005375541287144,
+      "grad_norm": 0.049841009080410004,
+      "learning_rate": 5.998394297881277e-05,
+      "loss": 1.5531,
+      "step": 2010
+    },
+    {
+      "epoch": 1.206510377781096,
+      "grad_norm": 0.04911394044756889,
+      "learning_rate": 5.974257449060306e-05,
+      "loss": 1.5512,
+      "step": 2020
+    },
+    {
+      "epoch": 1.2124832014334777,
+      "grad_norm": 0.05170886963605881,
+      "learning_rate": 5.9500251851417206e-05,
+      "loss": 1.5439,
+      "step": 2030
+    },
+    {
+      "epoch": 1.2184560250858594,
+      "grad_norm": 0.04615171626210213,
+      "learning_rate": 5.925698677261449e-05,
+      "loss": 1.5453,
+      "step": 2040
+    },
+    {
+      "epoch": 1.224428848738241,
+      "grad_norm": 0.04724368825554848,
+      "learning_rate": 5.901279101110191e-05,
+      "loss": 1.5434,
+      "step": 2050
+    },
+    {
+      "epoch": 1.2304016723906226,
+      "grad_norm": 0.06991260498762131,
+      "learning_rate": 5.8767676368766016e-05,
+      "loss": 1.5489,
+      "step": 2060
+    },
+    {
+      "epoch": 1.2363744960430043,
+      "grad_norm": 0.055575910955667496,
+      "learning_rate": 5.852165469190251e-05,
+      "loss": 1.5514,
+      "step": 2070
+    },
+    {
+      "epoch": 1.242347319695386,
+      "grad_norm": 0.04874608293175697,
+      "learning_rate": 5.82747378706437e-05,
+      "loss": 1.5523,
+      "step": 2080
+    },
+    {
+      "epoch": 1.2483201433477678,
+      "grad_norm": 0.05960864573717117,
+      "learning_rate": 5.8026937838383914e-05,
+      "loss": 1.5469,
+      "step": 2090
+    },
+    {
+      "epoch": 1.2542929670001493,
+      "grad_norm": 0.07086056470870972,
+      "learning_rate": 5.77782665712027e-05,
+      "loss": 1.5497,
+      "step": 2100
+    },
+    {
+      "epoch": 1.260265790652531,
+      "grad_norm": 0.0472436398267746,
+      "learning_rate": 5.752873608728603e-05,
+      "loss": 1.5425,
+      "step": 2110
+    },
+    {
+      "epoch": 1.2662386143049127,
+      "grad_norm": 0.06843575835227966,
+      "learning_rate": 5.7278358446345545e-05,
+      "loss": 1.542,
+      "step": 2120
+    },
+    {
+      "epoch": 1.2722114379572944,
+      "grad_norm": 0.04991114139556885,
+      "learning_rate": 5.702714574903561e-05,
+      "loss": 1.5423,
+      "step": 2130
+    },
+    {
+      "epoch": 1.278184261609676,
+      "grad_norm": 0.04601559415459633,
+      "learning_rate": 5.6775110136368576e-05,
+      "loss": 1.5357,
+      "step": 2140
+    },
+    {
+      "epoch": 1.2841570852620576,
+      "grad_norm": 0.042647868394851685,
+      "learning_rate": 5.6522263789127937e-05,
+      "loss": 1.5386,
+      "step": 2150
+    },
+    {
+      "epoch": 1.2901299089144393,
+      "grad_norm": 0.06261768937110901,
+      "learning_rate": 5.626861892727969e-05,
+      "loss": 1.5428,
+      "step": 2160
+    },
+    {
+      "epoch": 1.2961027325668208,
+      "grad_norm": 0.04735434427857399,
+      "learning_rate": 5.601418780938175e-05,
+      "loss": 1.5395,
+      "step": 2170
+    },
+    {
+      "epoch": 1.3020755562192026,
+      "grad_norm": 0.048824459314346313,
+      "learning_rate": 5.575898273199146e-05,
+      "loss": 1.5418,
+      "step": 2180
+    },
+    {
+      "epoch": 1.3080483798715843,
+      "grad_norm": 0.04974917694926262,
+      "learning_rate": 5.5503016029071354e-05,
+      "loss": 1.5371,
+      "step": 2190
+    },
+    {
+      "epoch": 1.314021203523966,
+      "grad_norm": 0.05275791883468628,
+      "learning_rate": 5.5246300071392985e-05,
+      "loss": 1.5364,
+      "step": 2200
+    },
+    {
+      "epoch": 1.3199940271763477,
+      "grad_norm": 0.0487825907766819,
+      "learning_rate": 5.4988847265939146e-05,
+      "loss": 1.5436,
+      "step": 2210
+    },
+    {
+      "epoch": 1.3259668508287292,
+      "grad_norm": 0.06100558117032051,
+      "learning_rate": 5.473067005530416e-05,
+      "loss": 1.5351,
+      "step": 2220
+    },
+    {
+      "epoch": 1.331939674481111,
+      "grad_norm": 0.07098929584026337,
+      "learning_rate": 5.447178091709262e-05,
+      "loss": 1.5463,
+      "step": 2230
+    },
+    {
+      "epoch": 1.3379124981334927,
+      "grad_norm": 0.06729080528020859,
+      "learning_rate": 5.421219236331624e-05,
+      "loss": 1.5382,
+      "step": 2240
+    },
+    {
+      "epoch": 1.3438853217858742,
+      "grad_norm": 0.05485675856471062,
+      "learning_rate": 5.395191693978927e-05,
+      "loss": 1.5349,
+      "step": 2250
+    },
+    {
+      "epoch": 1.3498581454382559,
+      "grad_norm": 0.05816954746842384,
+      "learning_rate": 5.3690967225522076e-05,
+      "loss": 1.5406,
+      "step": 2260
+    },
+    {
+      "epoch": 1.3558309690906376,
+      "grad_norm": 0.044427741318941116,
+      "learning_rate": 5.342935583211327e-05,
+      "loss": 1.5309,
+      "step": 2270
+    },
+    {
+      "epoch": 1.3618037927430193,
+      "grad_norm": 0.05544894561171532,
+      "learning_rate": 5.31670954031401e-05,
+      "loss": 1.5365,
+      "step": 2280
+    },
+    {
+      "epoch": 1.367776616395401,
+      "grad_norm": 0.04774465411901474,
+      "learning_rate": 5.290419861354753e-05,
+      "loss": 1.5303,
+      "step": 2290
+    },
+    {
+      "epoch": 1.3737494400477825,
+      "grad_norm": 0.050910986959934235,
+      "learning_rate": 5.264067816903552e-05,
+      "loss": 1.5384,
+      "step": 2300
+    },
+    {
+      "epoch": 1.3797222637001643,
+      "grad_norm": 0.05830187723040581,
+      "learning_rate": 5.2376546805445054e-05,
+      "loss": 1.535,
+      "step": 2310
+    },
+    {
+      "epoch": 1.385695087352546,
+      "grad_norm": 0.0521889254450798,
+      "learning_rate": 5.211181728814262e-05,
+      "loss": 1.5348,
+      "step": 2320
+    },
+    {
+      "epoch": 1.3916679110049275,
+      "grad_norm": 0.04742933064699173,
+      "learning_rate": 5.18465024114032e-05,
+      "loss": 1.5421,
+      "step": 2330
+    },
+    {
+      "epoch": 1.3976407346573092,
+      "grad_norm": 0.05169609189033508,
+      "learning_rate": 5.158061499779201e-05,
+      "loss": 1.5322,
+      "step": 2340
+    },
+    {
+      "epoch": 1.403613558309691,
+      "grad_norm": 0.05307742580771446,
+      "learning_rate": 5.131416789754472e-05,
+      "loss": 1.538,
+      "step": 2350
+    },
+    {
+      "epoch": 1.4095863819620726,
+      "grad_norm": 0.04581635445356369,
+      "learning_rate": 5.1047173987946474e-05,
+      "loss": 1.5313,
+      "step": 2360
+    },
+    {
+      "epoch": 1.4155592056144544,
+      "grad_norm": 0.04794102534651756,
+      "learning_rate": 5.077964617270947e-05,
+      "loss": 1.5357,
+      "step": 2370
+    },
+    {
+      "epoch": 1.4215320292668359,
+      "grad_norm": 0.043038323521614075,
+      "learning_rate": 5.051159738134937e-05,
+      "loss": 1.5362,
+      "step": 2380
+    },
+    {
+      "epoch": 1.4275048529192176,
+      "grad_norm": 0.052804794162511826,
+      "learning_rate": 5.024304056856039e-05,
+      "loss": 1.5299,
+      "step": 2390
+    },
+    {
+      "epoch": 1.4334776765715993,
+      "grad_norm": 0.051046222448349,
+      "learning_rate": 4.997398871358928e-05,
+      "loss": 1.529,
+      "step": 2400
+    },
+    {
+      "epoch": 1.4394505002239808,
+      "grad_norm": 0.056139182299375534,
+      "learning_rate": 4.970445481960793e-05,
+      "loss": 1.5368,
+      "step": 2410
+    },
+    {
+      "epoch": 1.4454233238763625,
+      "grad_norm": 0.04890932887792587,
+      "learning_rate": 4.9434451913085e-05,
+      "loss": 1.5308,
+      "step": 2420
+    },
+    {
+      "epoch": 1.4513961475287442,
+      "grad_norm": 0.04679281637072563,
+      "learning_rate": 4.916399304315636e-05,
+      "loss": 1.5353,
+      "step": 2430
+    },
+    {
+      "epoch": 1.457368971181126,
+      "grad_norm": 0.05536729097366333,
+      "learning_rate": 4.8893091280994415e-05,
+      "loss": 1.5314,
+      "step": 2440
+    },
+    {
+      "epoch": 1.4633417948335075,
+      "grad_norm": 0.04933058097958565,
+      "learning_rate": 4.862175971917637e-05,
+      "loss": 1.5301,
+      "step": 2450
+    },
+    {
+      "epoch": 1.4693146184858892,
+      "grad_norm": 0.05884556844830513,
+      "learning_rate": 4.835001147105148e-05,
+      "loss": 1.5213,
+      "step": 2460
+    },
+    {
+      "epoch": 1.475287442138271,
+      "grad_norm": 0.04465237259864807,
+      "learning_rate": 4.807785967010729e-05,
+      "loss": 1.5288,
+      "step": 2470
+    },
+    {
+      "epoch": 1.4812602657906524,
+      "grad_norm": 0.04548431187868118,
+      "learning_rate": 4.780531746933491e-05,
+      "loss": 1.5353,
+      "step": 2480
+    },
+    {
+      "epoch": 1.4872330894430341,
+      "grad_norm": 0.047798071056604385,
+      "learning_rate": 4.7532398040593295e-05,
+      "loss": 1.5261,
+      "step": 2490
+    },
+    {
+      "epoch": 1.4932059130954158,
+      "grad_norm": 0.05616561323404312,
+      "learning_rate": 4.7259114573972715e-05,
+      "loss": 1.5343,
+      "step": 2500
+    },
+    {
+      "epoch": 1.4991787367477976,
+      "grad_norm": 0.053861986845731735,
+      "learning_rate": 4.6985480277157215e-05,
+      "loss": 1.5249,
+      "step": 2510
+    },
+    {
+      "epoch": 1.5051515604001793,
+      "grad_norm": 0.05890486761927605,
+      "learning_rate": 4.671150837478634e-05,
+      "loss": 1.5357,
+      "step": 2520
+    },
+    {
+      "epoch": 1.511124384052561,
+      "grad_norm": 0.056382015347480774,
+      "learning_rate": 4.643721210781601e-05,
+      "loss": 1.5159,
+      "step": 2530
+    },
+    {
+      "epoch": 1.5170972077049425,
+      "grad_norm": 0.051396943628787994,
+      "learning_rate": 4.6162604732878515e-05,
+      "loss": 1.5301,
+      "step": 2540
+    },
+    {
+      "epoch": 1.5230700313573242,
+      "grad_norm": 0.04754629358649254,
+      "learning_rate": 4.588769952164191e-05,
+      "loss": 1.5277,
+      "step": 2550
+    },
+    {
+      "epoch": 1.5290428550097057,
+      "grad_norm": 0.0532587394118309,
+      "learning_rate": 4.561250976016851e-05,
+      "loss": 1.5201,
+      "step": 2560
+    },
+    {
+      "epoch": 1.5350156786620874,
+      "grad_norm": 0.059257134795188904,
+      "learning_rate": 4.5337048748272905e-05,
+      "loss": 1.5265,
+      "step": 2570
+    },
+    {
+      "epoch": 1.5409885023144692,
+      "grad_norm": 0.05495699495077133,
+      "learning_rate": 4.5061329798879064e-05,
+      "loss": 1.5247,
+      "step": 2580
+    },
+    {
+      "epoch": 1.5469613259668509,
+      "grad_norm": 0.04833153635263443,
+      "learning_rate": 4.478536623737699e-05,
+      "loss": 1.5291,
+      "step": 2590
+    },
+    {
+      "epoch": 1.5529341496192326,
+      "grad_norm": 0.048605091869831085,
+      "learning_rate": 4.450917140097869e-05,
+      "loss": 1.5277,
+      "step": 2600
+    },
+    {
+      "epoch": 1.5589069732716143,
+      "grad_norm": 0.06368768960237503,
+      "learning_rate": 4.4232758638073585e-05,
+      "loss": 1.5306,
+      "step": 2610
+    },
+    {
+      "epoch": 1.5648797969239958,
+      "grad_norm": 0.04569351673126221,
+      "learning_rate": 4.395614130758344e-05,
+      "loss": 1.5208,
+      "step": 2620
+    },
+    {
+      "epoch": 1.5708526205763775,
+      "grad_norm": 0.07877717167139053,
+      "learning_rate": 4.367933277831666e-05,
+      "loss": 1.5152,
+      "step": 2630
+    },
+    {
+      "epoch": 1.576825444228759,
+      "grad_norm": 0.05059320852160454,
+      "learning_rate": 4.34023464283222e-05,
+      "loss": 1.5199,
+      "step": 2640
+    },
+    {
+      "epoch": 1.5827982678811408,
+      "grad_norm": 0.05248813331127167,
+      "learning_rate": 4.312519564424306e-05,
+      "loss": 1.5236,
+      "step": 2650
+    },
+    {
+      "epoch": 1.5887710915335225,
+      "grad_norm": 0.051895346492528915,
+      "learning_rate": 4.2847893820669244e-05,
+      "loss": 1.5225,
+      "step": 2660
+    },
+    {
+      "epoch": 1.5947439151859042,
+      "grad_norm": 0.048129428178071976,
+      "learning_rate": 4.2570454359490455e-05,
+      "loss": 1.5259,
+      "step": 2670
+    },
+    {
+      "epoch": 1.600716738838286,
+      "grad_norm": 0.049009375274181366,
+      "learning_rate": 4.2292890669248364e-05,
+      "loss": 1.533,
+      "step": 2680
+    },
+    {
+      "epoch": 1.6066895624906674,
+      "grad_norm": 0.05925741046667099,
+      "learning_rate": 4.2015216164488575e-05,
+      "loss": 1.5242,
+      "step": 2690
+    },
+    {
+      "epoch": 1.6126623861430491,
+      "grad_norm": 0.051209457218647,
+      "learning_rate": 4.173744426511231e-05,
+      "loss": 1.5348,
+      "step": 2700
+    },
+    {
+      "epoch": 1.6186352097954306,
+      "grad_norm": 0.04731997102499008,
+      "learning_rate": 4.1459588395727876e-05,
+      "loss": 1.5179,
+      "step": 2710
+    },
+    {
+      "epoch": 1.6246080334478123,
+      "grad_norm": 0.04640951007604599,
+      "learning_rate": 4.118166198500178e-05,
+      "loss": 1.5218,
+      "step": 2720
+    },
+    {
+      "epoch": 1.630580857100194,
+      "grad_norm": 0.05060356855392456,
+      "learning_rate": 4.090367846500976e-05,
+      "loss": 1.5184,
+      "step": 2730
+    },
+    {
+      "epoch": 1.6365536807525758,
+      "grad_norm": 0.04525948315858841,
+      "learning_rate": 4.062565127058764e-05,
+      "loss": 1.5207,
+      "step": 2740
+    },
+    {
+      "epoch": 1.6425265044049575,
+      "grad_norm": 0.0447864904999733,
+      "learning_rate": 4.0347593838682016e-05,
+      "loss": 1.5265,
+      "step": 2750
+    },
+    {
+      "epoch": 1.6484993280573392,
+      "grad_norm": 0.06339412927627563,
+      "learning_rate": 4.006951960770084e-05,
+      "loss": 1.5296,
+      "step": 2760
+    },
+    {
+      "epoch": 1.6544721517097207,
+      "grad_norm": 0.05479173734784126,
+      "learning_rate": 3.979144201686396e-05,
+      "loss": 1.5167,
+      "step": 2770
+    },
+    {
+      "epoch": 1.6604449753621024,
+      "grad_norm": 0.05605393648147583,
+      "learning_rate": 3.951337450555361e-05,
+      "loss": 1.5208,
+      "step": 2780
+    },
+    {
+      "epoch": 1.666417799014484,
+      "grad_norm": 0.04500933736562729,
+      "learning_rate": 3.923533051266486e-05,
+      "loss": 1.5199,
+      "step": 2790
+    },
+    {
+      "epoch": 1.6723906226668657,
+      "grad_norm": 0.044439464807510376,
+      "learning_rate": 3.8957323475956165e-05,
+      "loss": 1.5254,
+      "step": 2800
+    },
+    {
+      "epoch": 1.6783634463192474,
+      "grad_norm": 0.051942795515060425,
+      "learning_rate": 3.867936683139991e-05,
+      "loss": 1.5168,
+      "step": 2810
+    },
+    {
+      "epoch": 1.684336269971629,
+      "grad_norm": 0.05696643143892288,
+      "learning_rate": 3.840147401253305e-05,
+      "loss": 1.5261,
+      "step": 2820
+    },
+    {
+      "epoch": 1.6903090936240108,
+      "grad_norm": 0.0423273928463459,
+      "learning_rate": 3.812365844980782e-05,
+      "loss": 1.5166,
+      "step": 2830
+    },
+    {
+      "epoch": 1.6962819172763925,
+      "grad_norm": 0.04251600056886673,
+      "learning_rate": 3.784593356994275e-05,
+      "loss": 1.514,
+      "step": 2840
+    },
+    {
+      "epoch": 1.702254740928774,
+      "grad_norm": 0.06778108328580856,
+      "learning_rate": 3.7568312795273675e-05,
+      "loss": 1.5161,
+      "step": 2850
+    },
+    {
+      "epoch": 1.7082275645811558,
+      "grad_norm": 0.046843383461236954,
+      "learning_rate": 3.729080954310509e-05,
+      "loss": 1.5215,
+      "step": 2860
+    },
+    {
+      "epoch": 1.7142003882335373,
+      "grad_norm": 0.04683705046772957,
+      "learning_rate": 3.701343722506164e-05,
+      "loss": 1.5191,
+      "step": 2870
+    },
+    {
+      "epoch": 1.720173211885919,
+      "grad_norm": 0.04883548244833946,
+      "learning_rate": 3.673620924644e-05,
+      "loss": 1.5175,
+      "step": 2880
+    },
+    {
+      "epoch": 1.7261460355383007,
+      "grad_norm": 0.047556836158037186,
+      "learning_rate": 3.6459139005560966e-05,
+      "loss": 1.5191,
+      "step": 2890
+    },
+    {
+      "epoch": 1.7321188591906824,
+      "grad_norm": 0.04096701368689537,
+      "learning_rate": 3.618223989312195e-05,
+      "loss": 1.5195,
+      "step": 2900
+    },
+    {
+      "epoch": 1.7380916828430641,
+      "grad_norm": 0.043791547417640686,
+      "learning_rate": 3.590552529154974e-05,
+      "loss": 1.5149,
+      "step": 2910
+    },
+    {
+      "epoch": 1.7440645064954459,
+      "grad_norm": 0.06429862976074219,
+      "learning_rate": 3.562900857435384e-05,
+      "loss": 1.5136,
+      "step": 2920
+    },
+    {
+      "epoch": 1.7500373301478274,
+      "grad_norm": 0.04811246693134308,
+      "learning_rate": 3.535270310548007e-05,
+      "loss": 1.5178,
+      "step": 2930
+    },
+    {
+      "epoch": 1.756010153800209,
+      "grad_norm": 0.05720449239015579,
+      "learning_rate": 3.5076622238664675e-05,
+      "loss": 1.5112,
+      "step": 2940
+    },
+    {
+      "epoch": 1.7619829774525906,
+      "grad_norm": 0.04717197269201279,
+      "learning_rate": 3.480077931678899e-05,
+      "loss": 1.5147,
+      "step": 2950
+    },
+    {
+      "epoch": 1.7679558011049723,
+      "grad_norm": 0.04889809712767601,
+      "learning_rate": 3.452518767123456e-05,
+      "loss": 1.5186,
+      "step": 2960
+    },
+    {
+      "epoch": 1.773928624757354,
+      "grad_norm": 0.055686600506305695,
+      "learning_rate": 3.424986062123883e-05,
+      "loss": 1.5105,
+      "step": 2970
+    },
+    {
+      "epoch": 1.7799014484097357,
+      "grad_norm": 0.045671623200178146,
+      "learning_rate": 3.397481147325146e-05,
+      "loss": 1.5236,
+      "step": 2980
+    },
+    {
+      "epoch": 1.7858742720621175,
+      "grad_norm": 0.0518915057182312,
+      "learning_rate": 3.370005352029122e-05,
+      "loss": 1.5082,
+      "step": 2990
+    },
+    {
+      "epoch": 1.7918470957144992,
+      "grad_norm": 0.0466337613761425,
+      "learning_rate": 3.342560004130351e-05,
+      "loss": 1.5246,
+      "step": 3000
+    },
+    {
+      "epoch": 1.7918470957144992,
+      "eval_loss": 1.5170252323150635,
+      "eval_runtime": 20.1093,
+      "eval_samples_per_second": 1722.235,
+      "eval_steps_per_second": 13.476,
+      "step": 3000
+    },
+    {
+      "epoch": 1.7978199193668807,
+      "grad_norm": 0.04238193854689598,
+      "learning_rate": 3.3151464300518634e-05,
+      "loss": 1.5097,
+      "step": 3010
+    },
+    {
+      "epoch": 1.8037927430192624,
+      "grad_norm": 0.050784409046173096,
+      "learning_rate": 3.2877659546810745e-05,
+      "loss": 1.5195,
+      "step": 3020
+    },
+    {
+      "epoch": 1.809765566671644,
+      "grad_norm": 0.04055749997496605,
+      "learning_rate": 3.260419901305751e-05,
+      "loss": 1.5171,
+      "step": 3030
+    },
+    {
+      "epoch": 1.8157383903240256,
+      "grad_norm": 0.05311364307999611,
+      "learning_rate": 3.2331095915500564e-05,
+      "loss": 1.5136,
+      "step": 3040
+    },
+    {
+      "epoch": 1.8217112139764073,
+      "grad_norm": 0.0499190054833889,
+      "learning_rate": 3.205836345310681e-05,
+      "loss": 1.5081,
+      "step": 3050
+    },
+    {
+      "epoch": 1.827684037628789,
+      "grad_norm": 0.056762441992759705,
+      "learning_rate": 3.178601480693048e-05,
+      "loss": 1.5243,
+      "step": 3060
+    },
+    {
+      "epoch": 1.8336568612811708,
+      "grad_norm": 0.04753740131855011,
+      "learning_rate": 3.151406313947615e-05,
+      "loss": 1.5069,
+      "step": 3070
+    },
+    {
+      "epoch": 1.8396296849335525,
+      "grad_norm": 0.054608915001153946,
+      "learning_rate": 3.124252159406251e-05,
+      "loss": 1.5172,
+      "step": 3080
+    },
+    {
+      "epoch": 1.845602508585934,
+      "grad_norm": 0.04840042069554329,
+      "learning_rate": 3.097140329418726e-05,
+      "loss": 1.5126,
+      "step": 3090
+    },
+    {
+      "epoch": 1.8515753322383157,
+      "grad_norm": 0.05584624037146568,
+      "learning_rate": 3.07007213428928e-05,
+      "loss": 1.5091,
+      "step": 3100
+    },
+    {
+      "epoch": 1.8575481558906972,
+      "grad_norm": 0.0425049252808094,
+      "learning_rate": 3.0430488822132957e-05,
+      "loss": 1.5155,
+      "step": 3110
+    },
+    {
+      "epoch": 1.863520979543079,
+      "grad_norm": 0.043588876724243164,
+      "learning_rate": 3.016071879214077e-05,
+      "loss": 1.5099,
+      "step": 3120
+    },
+    {
+      "epoch": 1.8694938031954607,
+      "grad_norm": 0.041503310203552246,
+      "learning_rate": 2.989142429079725e-05,
+      "loss": 1.509,
+      "step": 3130
+    },
+    {
+      "epoch": 1.8754666268478424,
+      "grad_norm": 0.04797055944800377,
+      "learning_rate": 2.962261833300133e-05,
+      "loss": 1.507,
+      "step": 3140
+    },
+    {
+      "epoch": 1.881439450500224,
+      "grad_norm": 0.05003626272082329,
+      "learning_rate": 2.935431391004081e-05,
+      "loss": 1.5177,
+      "step": 3150
+    },
+    {
+      "epoch": 1.8874122741526056,
+      "grad_norm": 0.04475341737270355,
+      "learning_rate": 2.9086523988964478e-05,
+      "loss": 1.5077,
+      "step": 3160
+    },
+    {
+      "epoch": 1.8933850978049873,
+      "grad_norm": 0.04602671042084694,
+      "learning_rate": 2.881926151195547e-05,
+      "loss": 1.5037,
+      "step": 3170
+    },
+    {
+      "epoch": 1.8993579214573688,
+      "grad_norm": 0.04945210739970207,
+      "learning_rate": 2.855253939570578e-05,
+      "loss": 1.503,
+      "step": 3180
+    },
+    {
+      "epoch": 1.9053307451097505,
+      "grad_norm": 0.04730582609772682,
+      "learning_rate": 2.8286370530791914e-05,
+      "loss": 1.5064,
+      "step": 3190
+    },
+    {
+      "epoch": 1.9113035687621323,
+      "grad_norm": 0.05128956586122513,
+      "learning_rate": 2.8020767781052016e-05,
+      "loss": 1.5126,
+      "step": 3200
+    },
+    {
+      "epoch": 1.917276392414514,
+      "grad_norm": 0.055559854954481125,
+      "learning_rate": 2.7755743982964066e-05,
+      "loss": 1.5052,
+      "step": 3210
+    },
+    {
+      "epoch": 1.9232492160668957,
+      "grad_norm": 0.036298781633377075,
+      "learning_rate": 2.749131194502555e-05,
+      "loss": 1.5092,
+      "step": 3220
+    },
+    {
+      "epoch": 1.9292220397192774,
+      "grad_norm": 0.042619943618774414,
+      "learning_rate": 2.7227484447134398e-05,
+      "loss": 1.5044,
+      "step": 3230
+    },
+    {
+      "epoch": 1.935194863371659,
+      "grad_norm": 0.052806805819272995,
+      "learning_rate": 2.696427423997138e-05,
+      "loss": 1.5056,
+      "step": 3240
+    },
+    {
+      "epoch": 1.9411676870240406,
+      "grad_norm": 0.044467948377132416,
+      "learning_rate": 2.670169404438383e-05,
+      "loss": 1.5114,
+      "step": 3250
+    },
+    {
+      "epoch": 1.9471405106764221,
+      "grad_norm": 0.038638997822999954,
+      "learning_rate": 2.6439756550770872e-05,
+      "loss": 1.5154,
+      "step": 3260
+    },
+    {
+      "epoch": 1.9531133343288039,
+      "grad_norm": 0.04845379292964935,
+      "learning_rate": 2.617847441847007e-05,
+      "loss": 1.51,
+      "step": 3270
+    },
+    {
+      "epoch": 1.9590861579811856,
+      "grad_norm": 0.0445607528090477,
+      "learning_rate": 2.5917860275145658e-05,
+      "loss": 1.5047,
+      "step": 3280
+    },
+    {
+      "epoch": 1.9650589816335673,
+      "grad_norm": 0.045905206352472305,
+      "learning_rate": 2.5657926716178217e-05,
+      "loss": 1.5118,
+      "step": 3290
+    },
+    {
+      "epoch": 1.971031805285949,
+      "grad_norm": 0.04530317336320877,
+      "learning_rate": 2.539868630405594e-05,
+      "loss": 1.5099,
+      "step": 3300
+    },
+    {
+      "epoch": 1.9770046289383307,
+      "grad_norm": 0.04195258021354675,
+      "learning_rate": 2.5140151567767505e-05,
+      "loss": 1.5075,
+      "step": 3310
+    },
+    {
+      "epoch": 1.9829774525907122,
+      "grad_norm": 0.043815840035676956,
+      "learning_rate": 2.4882335002196553e-05,
+      "loss": 1.5096,
+      "step": 3320
+    },
+    {
+      "epoch": 1.988950276243094,
+      "grad_norm": 0.04683714732527733,
+      "learning_rate": 2.4625249067517803e-05,
+      "loss": 1.5057,
+      "step": 3330
+    },
+    {
+      "epoch": 1.9949230998954754,
+      "grad_norm": 0.049690209329128265,
+      "learning_rate": 2.4368906188594877e-05,
+      "loss": 1.5106,
+      "step": 3340
+    },
+    {
+      "epoch": 2.000895923547857,
+      "grad_norm": 0.048324376344680786,
+      "learning_rate": 2.4113318754379816e-05,
+      "loss": 1.5042,
+      "step": 3350
+    },
+    {
+      "epoch": 2.006868747200239,
+      "grad_norm": 0.05503029376268387,
+      "learning_rate": 2.385849911731426e-05,
+      "loss": 1.4922,
+      "step": 3360
+    },
+    {
+      "epoch": 2.0128415708526206,
+      "grad_norm": 0.049435921013355255,
+      "learning_rate": 2.360445959273255e-05,
+      "loss": 1.4962,
+      "step": 3370
+    },
+    {
+      "epoch": 2.0188143945050023,
+      "grad_norm": 0.05086649954319,
+      "learning_rate": 2.3351212458266512e-05,
+      "loss": 1.4918,
+      "step": 3380
+    },
+    {
+      "epoch": 2.024787218157384,
+      "grad_norm": 0.045887332409620285,
+      "learning_rate": 2.3098769953252002e-05,
+      "loss": 1.4868,
+      "step": 3390
+    },
+    {
+      "epoch": 2.0307600418097658,
+      "grad_norm": 0.04303443059325218,
+      "learning_rate": 2.2847144278137502e-05,
+      "loss": 1.4982,
+      "step": 3400
+    },
+    {
+      "epoch": 2.036732865462147,
+      "grad_norm": 0.043649692088365555,
+      "learning_rate": 2.2596347593894387e-05,
+      "loss": 1.5,
+      "step": 3410
+    },
+    {
+      "epoch": 2.0427056891145288,
+      "grad_norm": 0.04276139661669731,
+      "learning_rate": 2.2346392021429254e-05,
+      "loss": 1.4903,
+      "step": 3420
+    },
+    {
+      "epoch": 2.0486785127669105,
+      "grad_norm": 0.04298582300543785,
+      "learning_rate": 2.2097289640998074e-05,
+      "loss": 1.5032,
+      "step": 3430
+    },
+    {
+      "epoch": 2.054651336419292,
+      "grad_norm": 0.053750213235616684,
+      "learning_rate": 2.1849052491622374e-05,
+      "loss": 1.4942,
+      "step": 3440
+    },
+    {
+      "epoch": 2.060624160071674,
+      "grad_norm": 0.042636483907699585,
+      "learning_rate": 2.160169257050742e-05,
+      "loss": 1.4976,
+      "step": 3450
+    },
+    {
+      "epoch": 2.0665969837240556,
+      "grad_norm": 0.05124128982424736,
+      "learning_rate": 2.135522183246237e-05,
+      "loss": 1.4981,
+      "step": 3460
+    },
+    {
+      "epoch": 2.0725698073764374,
+      "grad_norm": 0.047978244721889496,
+      "learning_rate": 2.110965218932247e-05,
+      "loss": 1.4975,
+      "step": 3470
+    },
+    {
+      "epoch": 2.078542631028819,
+      "grad_norm": 0.045476969331502914,
+      "learning_rate": 2.0864995509373448e-05,
+      "loss": 1.4958,
+      "step": 3480
+    },
+    {
+      "epoch": 2.0845154546812004,
+      "grad_norm": 0.05264231190085411,
+      "learning_rate": 2.062126361677786e-05,
+      "loss": 1.4996,
+      "step": 3490
+    },
+    {
+      "epoch": 2.090488278333582,
+      "grad_norm": 0.05144358426332474,
+      "learning_rate": 2.037846829100364e-05,
+      "loss": 1.5077,
+      "step": 3500
+    },
+    {
+      "epoch": 2.096461101985964,
+      "grad_norm": 0.048265036195516586,
+      "learning_rate": 2.013662126625482e-05,
+      "loss": 1.4987,
+      "step": 3510
+    },
+    {
+      "epoch": 2.1024339256383455,
+      "grad_norm": 0.04586884751915932,
+      "learning_rate": 1.9895734230904396e-05,
+      "loss": 1.5044,
+      "step": 3520
+    },
+    {
+      "epoch": 2.1084067492907272,
+      "grad_norm": 0.03930211812257767,
+      "learning_rate": 1.965581882692949e-05,
+      "loss": 1.4951,
+      "step": 3530
+    },
+    {
+      "epoch": 2.114379572943109,
+      "grad_norm": 0.051928870379924774,
+      "learning_rate": 1.9416886649348575e-05,
+      "loss": 1.4962,
+      "step": 3540
+    },
+    {
+      "epoch": 2.1203523965954907,
+      "grad_norm": 0.04466070607304573,
+      "learning_rate": 1.917894924566125e-05,
+      "loss": 1.4874,
+      "step": 3550
+    },
+    {
+      "epoch": 2.126325220247872,
+      "grad_norm": 0.044879212975502014,
+      "learning_rate": 1.8942018115290063e-05,
+      "loss": 1.4896,
+      "step": 3560
+    },
+    {
+      "epoch": 2.1322980439002537,
+      "grad_norm": 0.04508794844150543,
+      "learning_rate": 1.8706104709024715e-05,
+      "loss": 1.4915,
+      "step": 3570
+    },
+    {
+      "epoch": 2.1382708675526354,
+      "grad_norm": 0.06577686965465546,
+      "learning_rate": 1.8471220428468745e-05,
+      "loss": 1.4981,
+      "step": 3580
+    },
+    {
+      "epoch": 2.144243691205017,
+      "grad_norm": 0.03995177894830704,
+      "learning_rate": 1.823737662548843e-05,
+      "loss": 1.4973,
+      "step": 3590
+    },
+    {
+      "epoch": 2.150216514857399,
+      "grad_norm": 0.06114717572927475,
+      "learning_rate": 1.800458460166417e-05,
+      "loss": 1.4942,
+      "step": 3600
+    },
+    {
+      "epoch": 2.1561893385097806,
+      "grad_norm": 0.04745366424322128,
+      "learning_rate": 1.7772855607744284e-05,
+      "loss": 1.5004,
+      "step": 3610
+    },
+    {
+      "epoch": 2.1621621621621623,
+      "grad_norm": 0.045220714062452316,
+      "learning_rate": 1.7542200843101267e-05,
+      "loss": 1.494,
+      "step": 3620
+    },
+    {
+      "epoch": 2.168134985814544,
+      "grad_norm": 0.04914199188351631,
+      "learning_rate": 1.7312631455190528e-05,
+      "loss": 1.491,
+      "step": 3630
+    },
+    {
+      "epoch": 2.1741078094669257,
+      "grad_norm": 0.044854309409856796,
+      "learning_rate": 1.708415853901166e-05,
+      "loss": 1.4974,
+      "step": 3640
+    },
+    {
+      "epoch": 2.180080633119307,
+      "grad_norm": 0.0511915348470211,
+      "learning_rate": 1.6856793136572155e-05,
+      "loss": 1.4978,
+      "step": 3650
+    },
+    {
+      "epoch": 2.1860534567716887,
+      "grad_norm": 0.052235160022974014,
+      "learning_rate": 1.6630546236353833e-05,
+      "loss": 1.4884,
+      "step": 3660
+    },
+    {
+      "epoch": 2.1920262804240704,
+      "grad_norm": 0.03959416225552559,
+      "learning_rate": 1.6405428772781724e-05,
+      "loss": 1.4897,
+      "step": 3670
+    },
+    {
+      "epoch": 2.197999104076452,
+      "grad_norm": 0.04642707481980324,
+      "learning_rate": 1.618145162569563e-05,
+      "loss": 1.489,
+      "step": 3680
+    },
+    {
+      "epoch": 2.203971927728834,
+      "grad_norm": 0.05590491741895676,
+      "learning_rate": 1.5958625619824286e-05,
+      "loss": 1.4946,
+      "step": 3690
+    },
+    {
+      "epoch": 2.2099447513812156,
+      "grad_norm": 0.050484009087085724,
+      "learning_rate": 1.5736961524262232e-05,
+      "loss": 1.5011,
+      "step": 3700
+    },
+    {
+      "epoch": 2.2159175750335973,
+      "grad_norm": 0.04109204187989235,
+      "learning_rate": 1.551647005194932e-05,
+      "loss": 1.4993,
+      "step": 3710
+    },
+    {
+      "epoch": 2.2218903986859786,
+      "grad_norm": 0.04570942744612694,
+      "learning_rate": 1.5297161859152986e-05,
+      "loss": 1.491,
+      "step": 3720
+    },
+    {
+      "epoch": 2.2278632223383603,
+      "grad_norm": 0.041420578956604004,
+      "learning_rate": 1.5079047544953227e-05,
+      "loss": 1.4874,
+      "step": 3730
+    },
+    {
+      "epoch": 2.233836045990742,
+      "grad_norm": 0.04918381944298744,
+      "learning_rate": 1.486213765073032e-05,
+      "loss": 1.4939,
+      "step": 3740
+    },
+    {
+      "epoch": 2.2398088696431238,
+      "grad_norm": 0.05086056888103485,
+      "learning_rate": 1.4646442659655425e-05,
+      "loss": 1.4992,
+      "step": 3750
+    },
+    {
+      "epoch": 2.2457816932955055,
+      "grad_norm": 0.061345502734184265,
+      "learning_rate": 1.4431972996183894e-05,
+      "loss": 1.4935,
+      "step": 3760
+    },
+    {
+      "epoch": 2.251754516947887,
+      "grad_norm": 0.03802775219082832,
+      "learning_rate": 1.4218739025551469e-05,
+      "loss": 1.487,
+      "step": 3770
+    },
+    {
+      "epoch": 2.257727340600269,
+      "grad_norm": 0.039830368012189865,
+      "learning_rate": 1.4006751053273338e-05,
+      "loss": 1.4943,
+      "step": 3780
+    },
+    {
+      "epoch": 2.2637001642526506,
+      "grad_norm": 0.04441362991929054,
+      "learning_rate": 1.3796019324646062e-05,
+      "loss": 1.4907,
+      "step": 3790
+    },
+    {
+      "epoch": 2.269672987905032,
+      "grad_norm": 0.04267200455069542,
+      "learning_rate": 1.358655402425245e-05,
+      "loss": 1.4905,
+      "step": 3800
+    },
+    {
+      "epoch": 2.2756458115574136,
+      "grad_norm": 0.04467471316456795,
+      "learning_rate": 1.3378365275469322e-05,
+      "loss": 1.4865,
+      "step": 3810
+    },
+    {
+      "epoch": 2.2816186352097954,
+      "grad_norm": 0.04877958446741104,
+      "learning_rate": 1.3171463139978222e-05,
+      "loss": 1.4978,
+      "step": 3820
+    },
+    {
+      "epoch": 2.287591458862177,
+      "grad_norm": 0.04458734765648842,
+      "learning_rate": 1.2965857617279216e-05,
+      "loss": 1.4931,
+      "step": 3830
+    },
+    {
+      "epoch": 2.293564282514559,
+      "grad_norm": 0.043027278035879135,
+      "learning_rate": 1.2761558644207547e-05,
+      "loss": 1.495,
+      "step": 3840
+    },
+    {
+      "epoch": 2.2995371061669405,
+      "grad_norm": 0.03808119520545006,
+      "learning_rate": 1.2558576094453435e-05,
+      "loss": 1.4922,
+      "step": 3850
+    },
+    {
+      "epoch": 2.3055099298193222,
+      "grad_norm": 0.038997333496809006,
+      "learning_rate": 1.2356919778084867e-05,
+      "loss": 1.4915,
+      "step": 3860
+    },
+    {
+      "epoch": 2.3114827534717035,
+      "grad_norm": 0.04020654410123825,
+      "learning_rate": 1.2156599441073488e-05,
+      "loss": 1.4874,
+      "step": 3870
+    },
+    {
+      "epoch": 2.3174555771240852,
+      "grad_norm": 0.04891055077314377,
+      "learning_rate": 1.1957624764823566e-05,
+      "loss": 1.5016,
+      "step": 3880
+    },
+    {
+      "epoch": 2.323428400776467,
+      "grad_norm": 0.046524520963430405,
+      "learning_rate": 1.176000536570412e-05,
+      "loss": 1.4928,
+      "step": 3890
+    },
+    {
+      "epoch": 2.3294012244288487,
+      "grad_norm": 0.04302162304520607,
+      "learning_rate": 1.1563750794584156e-05,
+      "loss": 1.4905,
+      "step": 3900
+    },
+    {
+      "epoch": 2.3353740480812304,
+      "grad_norm": 0.046545591205358505,
+      "learning_rate": 1.1368870536371036e-05,
+      "loss": 1.4911,
+      "step": 3910
+    },
+    {
+      "epoch": 2.341346871733612,
+      "grad_norm": 0.04680660367012024,
+      "learning_rate": 1.1175374009552159e-05,
+      "loss": 1.4832,
+      "step": 3920
+    },
+    {
+      "epoch": 2.347319695385994,
+      "grad_norm": 0.04679818078875542,
+      "learning_rate": 1.0983270565739668e-05,
+      "loss": 1.4892,
+      "step": 3930
+    },
+    {
+      "epoch": 2.3532925190383756,
+      "grad_norm": 0.04409361630678177,
+      "learning_rate": 1.0792569489218598e-05,
+      "loss": 1.4907,
+      "step": 3940
+    },
+    {
+      "epoch": 2.3592653426907573,
+      "grad_norm": 0.04122375324368477,
+      "learning_rate": 1.0603279996498089e-05,
+      "loss": 1.4936,
+      "step": 3950
+    },
+    {
+      "epoch": 2.3652381663431385,
+      "grad_norm": 0.045084912329912186,
+      "learning_rate": 1.0415411235865979e-05,
+      "loss": 1.4852,
+      "step": 3960
+    },
+    {
+      "epoch": 2.3712109899955203,
+      "grad_norm": 0.04110685735940933,
+      "learning_rate": 1.0228972286946695e-05,
+      "loss": 1.494,
+      "step": 3970
+    },
+    {
+      "epoch": 2.377183813647902,
+      "grad_norm": 0.04527169466018677,
+      "learning_rate": 1.0043972160262392e-05,
+      "loss": 1.4955,
+      "step": 3980
+    },
+    {
+      "epoch": 2.3831566373002837,
+      "grad_norm": 0.04808187112212181,
+      "learning_rate": 9.860419796797527e-06,
+      "loss": 1.4858,
+      "step": 3990
+    },
+    {
+      "epoch": 2.3891294609526654,
+      "grad_norm": 0.03969137370586395,
+      "learning_rate": 9.678324067566716e-06,
+      "loss": 1.497,
+      "step": 4000
+    },
+    {
+      "epoch": 2.3891294609526654,
+      "eval_loss": 1.4980565309524536,
+      "eval_runtime": 20.0226,
+      "eval_samples_per_second": 1729.697,
+      "eval_steps_per_second": 13.535,
+      "step": 4000
+    },
+    {
+      "epoch": 2.395102284605047,
+      "grad_norm": 0.039191678166389465,
+      "learning_rate": 9.497693773185985e-06,
+      "loss": 1.491,
+      "step": 4010
+    },
+    {
+      "epoch": 2.401075108257429,
+      "grad_norm": 0.04326602816581726,
+      "learning_rate": 9.318537643447488e-06,
+      "loss": 1.4897,
+      "step": 4020
+    },
+    {
+      "epoch": 2.40704793190981,
+      "grad_norm": 0.04062432423233986,
+      "learning_rate": 9.140864336897559e-06,
+      "loss": 1.4834,
+      "step": 4030
+    },
+    {
+      "epoch": 2.413020755562192,
+      "grad_norm": 0.043511949479579926,
+      "learning_rate": 8.964682440418272e-06,
+      "loss": 1.4899,
+      "step": 4040
+    },
+    {
+      "epoch": 2.4189935792145736,
+      "grad_norm": 0.041364822536706924,
+      "learning_rate": 8.79000046881242e-06,
+      "loss": 1.4876,
+      "step": 4050
+    },
+    {
+      "epoch": 2.4249664028669553,
+      "grad_norm": 0.03720170632004738,
+      "learning_rate": 8.61682686439202e-06,
+      "loss": 1.4926,
+      "step": 4060
+    },
+    {
+      "epoch": 2.430939226519337,
+      "grad_norm": 0.04620780423283577,
+      "learning_rate": 8.44516999657027e-06,
+      "loss": 1.4929,
+      "step": 4070
+    },
+    {
+      "epoch": 2.4369120501717187,
+      "grad_norm": 0.03785783797502518,
+      "learning_rate": 8.275038161457094e-06,
+      "loss": 1.4917,
+      "step": 4080
+    },
+    {
+      "epoch": 2.4428848738241005,
+      "grad_norm": 0.047655072063207626,
+      "learning_rate": 8.106439581458177e-06,
+      "loss": 1.4923,
+      "step": 4090
+    },
+    {
+      "epoch": 2.448857697476482,
+      "grad_norm": 0.04838723689317703,
+      "learning_rate": 7.939382404877545e-06,
+      "loss": 1.4902,
+      "step": 4100
+    },
+    {
+      "epoch": 2.454830521128864,
+      "grad_norm": 0.0498916357755661,
+      "learning_rate": 7.773874705523826e-06,
+      "loss": 1.4846,
+      "step": 4110
+    },
+    {
+      "epoch": 2.460803344781245,
+      "grad_norm": 0.044865112751722336,
+      "learning_rate": 7.609924482320013e-06,
+      "loss": 1.4867,
+      "step": 4120
+    },
+    {
+      "epoch": 2.466776168433627,
+      "grad_norm": 0.041775912046432495,
+      "learning_rate": 7.447539658916869e-06,
+      "loss": 1.4869,
+      "step": 4130
+    },
+    {
+      "epoch": 2.4727489920860086,
+      "grad_norm": 0.03888450190424919,
+      "learning_rate": 7.286728083309995e-06,
+      "loss": 1.4824,
+      "step": 4140
+    },
+    {
+      "epoch": 2.4787218157383903,
+      "grad_norm": 0.05169163644313812,
+      "learning_rate": 7.127497527460541e-06,
+      "loss": 1.4856,
+      "step": 4150
+    },
+    {
+      "epoch": 2.484694639390772,
+      "grad_norm": 0.04095705598592758,
+      "learning_rate": 6.969855686919573e-06,
+      "loss": 1.4899,
+      "step": 4160
+    },
+    {
+      "epoch": 2.490667463043154,
+      "grad_norm": 0.0429367758333683,
+      "learning_rate": 6.81381018045618e-06,
+      "loss": 1.4848,
+      "step": 4170
+    },
+    {
+      "epoch": 2.4966402866955355,
+      "grad_norm": 0.04392432048916817,
+      "learning_rate": 6.659368549689209e-06,
+      "loss": 1.4832,
+      "step": 4180
+    },
+    {
+      "epoch": 2.502613110347917,
+      "grad_norm": 0.04673699662089348,
+      "learning_rate": 6.506538258722859e-06,
+      "loss": 1.4855,
+      "step": 4190
+    },
+    {
+      "epoch": 2.5085859340002985,
+      "grad_norm": 0.04074994474649429,
+      "learning_rate": 6.355326693785868e-06,
+      "loss": 1.4789,
+      "step": 4200
+    },
+    {
+      "epoch": 2.51455875765268,
+      "grad_norm": 0.035382091999053955,
+      "learning_rate": 6.2057411628745875e-06,
+      "loss": 1.4862,
+      "step": 4210
+    },
+    {
+      "epoch": 2.520531581305062,
+      "grad_norm": 0.03829929605126381,
+      "learning_rate": 6.057788895399781e-06,
+      "loss": 1.4852,
+      "step": 4220
+    },
+    {
+      "epoch": 2.5265044049574437,
+      "grad_norm": 0.04219154641032219,
+      "learning_rate": 5.9114770418372015e-06,
+      "loss": 1.4865,
+      "step": 4230
+    },
+    {
+      "epoch": 2.5324772286098254,
+      "grad_norm": 0.04591584950685501,
+      "learning_rate": 5.7668126733820476e-06,
+      "loss": 1.4737,
+      "step": 4240
+    },
+    {
+      "epoch": 2.538450052262207,
+      "grad_norm": 0.045854389667510986,
+      "learning_rate": 5.623802781607204e-06,
+      "loss": 1.4872,
+      "step": 4250
+    },
+    {
+      "epoch": 2.544422875914589,
+      "grad_norm": 0.04153481870889664,
+      "learning_rate": 5.48245427812534e-06,
+      "loss": 1.4806,
+      "step": 4260
+    },
+    {
+      "epoch": 2.5503956995669705,
+      "grad_norm": 0.03822470083832741,
+      "learning_rate": 5.342773994254842e-06,
+      "loss": 1.4792,
+      "step": 4270
+    },
+    {
+      "epoch": 2.556368523219352,
+      "grad_norm": 0.03870686888694763,
+      "learning_rate": 5.204768680689727e-06,
+      "loss": 1.4771,
+      "step": 4280
+    },
+    {
+      "epoch": 2.5623413468717335,
+      "grad_norm": 0.05567542836070061,
+      "learning_rate": 5.068445007173331e-06,
+      "loss": 1.4812,
+      "step": 4290
+    },
+    {
+      "epoch": 2.5683141705241153,
+      "grad_norm": 0.03914303705096245,
+      "learning_rate": 4.933809562175982e-06,
+      "loss": 1.4952,
+      "step": 4300
+    },
+    {
+      "epoch": 2.574286994176497,
+      "grad_norm": 0.04728810861706734,
+      "learning_rate": 4.800868852576561e-06,
+      "loss": 1.4813,
+      "step": 4310
+    },
+    {
+      "epoch": 2.5802598178288787,
+      "grad_norm": 0.04394581541419029,
+      "learning_rate": 4.669629303348066e-06,
+      "loss": 1.4779,
+      "step": 4320
+    },
+    {
+      "epoch": 2.5862326414812604,
+      "grad_norm": 0.042139682918787,
+      "learning_rate": 4.540097257247062e-06,
+      "loss": 1.4847,
+      "step": 4330
+    },
+    {
+      "epoch": 2.5922054651336417,
+      "grad_norm": 0.04580564424395561,
+      "learning_rate": 4.412278974507151e-06,
+      "loss": 1.4767,
+      "step": 4340
+    },
+    {
+      "epoch": 2.5981782887860234,
+      "grad_norm": 0.03395635262131691,
+      "learning_rate": 4.286180632536421e-06,
+      "loss": 1.4871,
+      "step": 4350
+    },
+    {
+      "epoch": 2.604151112438405,
+      "grad_norm": 0.04606311395764351,
+      "learning_rate": 4.161808325618886e-06,
+      "loss": 1.4865,
+      "step": 4360
+    },
+    {
+      "epoch": 2.610123936090787,
+      "grad_norm": 0.046741172671318054,
+      "learning_rate": 4.039168064619938e-06,
+      "loss": 1.4896,
+      "step": 4370
+    },
+    {
+      "epoch": 2.6160967597431686,
+      "grad_norm": 0.04130960628390312,
+      "learning_rate": 3.918265776695891e-06,
+      "loss": 1.4837,
+      "step": 4380
+    },
+    {
+      "epoch": 2.6220695833955503,
+      "grad_norm": 0.043055951595306396,
+      "learning_rate": 3.7991073050074678e-06,
+      "loss": 1.4841,
+      "step": 4390
+    },
+    {
+      "epoch": 2.628042407047932,
+      "grad_norm": 0.04418269917368889,
+      "learning_rate": 3.6816984084374485e-06,
+      "loss": 1.4831,
+      "step": 4400
+    },
+    {
+      "epoch": 2.6340152307003137,
+      "grad_norm": 0.036886971443891525,
+      "learning_rate": 3.5660447613123086e-06,
+      "loss": 1.4892,
+      "step": 4410
+    },
+    {
+      "epoch": 2.6399880543526955,
+      "grad_norm": 0.04421091824769974,
+      "learning_rate": 3.452151953128007e-06,
+      "loss": 1.4848,
+      "step": 4420
+    },
+    {
+      "epoch": 2.645960878005077,
+      "grad_norm": 0.042877208441495895,
+      "learning_rate": 3.3400254882798435e-06,
+      "loss": 1.4888,
+      "step": 4430
+    },
+    {
+      "epoch": 2.6519337016574585,
+      "grad_norm": 0.04234934598207474,
+      "learning_rate": 3.2296707857964125e-06,
+      "loss": 1.4796,
+      "step": 4440
+    },
+    {
+      "epoch": 2.65790652530984,
+      "grad_norm": 0.035217370837926865,
+      "learning_rate": 3.121093179077739e-06,
+      "loss": 1.481,
+      "step": 4450
+    },
+    {
+      "epoch": 2.663879348962222,
+      "grad_norm": 0.040508221834897995,
+      "learning_rate": 3.0142979156374806e-06,
+      "loss": 1.4819,
+      "step": 4460
+    },
+    {
+      "epoch": 2.6698521726146036,
+      "grad_norm": 0.041981033980846405,
+      "learning_rate": 2.9092901568493446e-06,
+      "loss": 1.4804,
+      "step": 4470
+    },
+    {
+      "epoch": 2.6758249962669853,
+      "grad_norm": 0.03790983185172081,
+      "learning_rate": 2.80607497769763e-06,
+      "loss": 1.4894,
+      "step": 4480
+    },
+    {
+      "epoch": 2.6817978199193666,
+      "grad_norm": 0.038940299302339554,
+      "learning_rate": 2.70465736653196e-06,
+      "loss": 1.4827,
+      "step": 4490
+    },
+    {
+      "epoch": 2.6877706435717483,
+      "grad_norm": 0.04031272605061531,
+      "learning_rate": 2.605042224826182e-06,
+      "loss": 1.4845,
+      "step": 4500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5022,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.9327446823064306e+19,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-4500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ab0d85bb1f90cdb54e5a3d9359ec0f6a3cdbed2af9fc2a0e31e87697e50efa4
+size 6712

checkpoint-4500/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,604 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    """Metadata extracted from one ``*_model_states.pt`` file by ``parse_model_states``.
+    The annotations are real types (the previous ``dict()`` annotations were dict instances,
+    which tools that inspect dataclass field types cannot interpret). Dataclasses do not enforce
+    annotations at runtime, so construction is unchanged.
+    """
+    # buffer name -> buffer tensor (converted to fp32 when loaded)
+    buffers: dict
+    # value stored under PARAM_SHAPES; the merge routines iterate it as a sequence of
+    # {param name: shape} mappings, one per param group
+    param_shapes: list
+    # [key, value] pairs copied from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # value stored under DS_VERSION, None when the key is absent
+    # NOTE(review): this is printed as ``deepspeed=={ds_version}``, so it is presumably a version
+    # string rather than an int - confirm before tightening the annotation
+    ds_version: int
+    # {param name: shape} of the frozen params, None when the checkpoint has no such entry
+    frozen_param_shapes: dict
+    # {param name: tensor fragment} of the frozen params, None when the checkpoint has no such entry
+    frozen_param_fragments: dict
+# set to a truthy value to enable the diagnostic prints guarded by `if debug:` below
+debug = 0
+# load to cpu
+# (passed as `map_location` to the torch.load() calls in this script)
+device = torch.device('cpu')
+def atoi(text):
+    """Return ``text`` as an int when it is made up of digits only, otherwise return it unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+def natural_keys(text):
+    """
+    Sort key for human ordering: use it as alist.sort(key=natural_keys)
+    The text is split into alternating non-digit and digit runs and every digit run is turned
+    into an int, so that numbers embedded in the text compare numerically.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    """
+    # the capturing group makes re.split keep the digit runs in the result
+    chunks = re.split(r'(\d+)', text)
+    return list(map(atoi, chunks))
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """Return the path of the model states file expected for ``zero_stage`` inside ``checkpoint_dir``.
+    Args:
+        - ``checkpoint_dir``: directory that holds the checkpoint files
+        - ``zero_stage``: ZeRO stage of the checkpoint; stages up to 2 and stage 3 use different file names
+    Returns:
+        the path of the existing model states file
+    Raises:
+        - ``FileNotFoundError``: if ``checkpoint_dir`` is not a directory or the expected file is missing
+        - ``ValueError``: if ``zero_stage`` is neither <= 2 nor 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch `file` stayed unbound and the check below raised UnboundLocalError;
+        # use the same error as parse_optim_states for an unsupported stage
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """Return the files in ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+    Raises ``FileNotFoundError`` when no file matches.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    matches.sort(key=natural_keys)
+    return matches
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_glob)
+def parse_model_states(files):
+    """Load each model states file and collect the metadata needed to rebuild the fp32 weights.
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files to read
+    Returns:
+        a list with one ``zero_model_state`` per entry of ``files``, in the same order
+    Raises:
+        - ``ValueError``: if a loaded file has no ``BUFFER_NAMES`` entry
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: torch.load unpickles the file, which can execute arbitrary code - only run this
+        # script on checkpoints from a trusted source
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # frozen parameters are optional: both entries are None when the checkpoint lacks them
+        # (the list of all param names that used to be assembled here was never read, so it is gone)
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """Load the optimizer state files and extract the fp32 weight partitions they carry.
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files
+        - ``ds_checkpoint_dir``: checkpoint folder, only used to build the error message
+    Returns:
+        ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` has one entry per file
+    Raises:
+        - ``ValueError``: if the first file has no ``ZERO_STAGE`` entry, if the number of files differs
+          from the recorded partition count, or if the stage is neither <= 2 nor 3
+    """
+    total_files = len(files)
+    state_dicts = []
+    for path in files:
+        loaded = torch.load(path, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        loaded["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(loaded)
+    # stage and partition count are read from the first file only
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild the fp32 state_dict from the files of a ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: handed unchanged to the stage-specific reconstruction function
+    Returns:
+        whatever the stage-specific reconstruction function returns
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # pick the reconstruction routine that matches the stage
+    if zero_stage <= 2:
+        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """Copy the frozen params recorded in the first model state into ``state_dict`` (in place).
+    Does nothing when the first model state has no frozen param shapes. Only the first entry
+    of ``zero_model_states`` is consulted.
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # the stored fragment is used as-is, no reshaping or concatenation
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    """Tell whether ``obj`` has an attribute named ``fn`` that can be called."""
+    return callable(getattr(obj, fn, None))
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Rebuild the trainable params from the per-file fp32 partitions and store them in ``state_dict``.
+    For each param group the partitions of all files are concatenated into one flat vector, which is
+    then cut into consecutive slices following the order of the ``param_shapes`` of the first model
+    state. Raises ``ValueError`` when, after alignment to ``2 * world_size``, the consumed element
+    count of a group differs from the size of its flat vector.
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    num_param_groups = len(fp32_flat_groups[0])
+    if debug:
+        for i in range(world_size):
+            for j in range(num_param_groups):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    merged_single_partition_of_fp32_groups = [
+        torch.cat([partitions[group] for partitions in fp32_flat_groups], 0) for group in range(num_param_groups)
+    ]
+    if debug:
+        avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)
+        wanted_params = sum(len(shapes) for shapes in param_shapes)
+        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        for name, shape in shapes.items():
+            # shapes may or may not provide numel(); fall back to multiplying the dimensions
+            if _has_callable(shape, 'numel'):
+                unpartitioned_numel = shape.numel()
+            else:
+                unpartitioned_numel = math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            flat_slice = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel)
+            state_dict[name] = flat_slice.view(shape)
+            offset += unpartitioned_numel
+        avail_numel = full_single_fp32_vector.numel()
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Assemble the state_dict for the ``zero_stage <= 2`` case.
+    The entries are added in this order: buffers, frozen params (skipped when
+    ``exclude_frozen_parameters`` is true), trainable params, shared params.
+    Returns the resulting ``OrderedDict``.
+    """
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        alias, source = pair[0], pair[1]
+        # pairs whose source is not in the state_dict are skipped
+        if source not in state_dict:
+            continue
+        state_dict[alias] = state_dict[source]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """Return ``(partitioned_numel, padding_numel)`` for a param of ``unpartitioned_numel`` elements.
+    ``partitioned_numel`` is the element count divided by ``world_size`` and rounded up;
+    ``padding_numel`` is what is missing to make the element count a multiple of ``world_size``.
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """Rebuild the frozen params from the fragments of all model states and store them in ``state_dict``.
+    For each frozen param the fragments of every model state are concatenated, cut back to the
+    param's element count and viewed with its shape. Does nothing when the first model state has
+    no frozen param shapes.
+    """
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        rank0_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values())
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        # narrow() keeps the first `unpartitioned_numel` elements of the concatenated fragments
+        joined = torch.cat(param_frags, 0)
+        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-579/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "/mnt/ddn/yrm/model/MMfreeLM-370M",
+  "architectures": [
+    "HGRNBitForCausalLM"
+  ],
+  "attn_mode": "fused_recurrent",
+  "bos_token_id": 1,
+  "conv_size": 4,
+  "eos_token_id": 2,
+  "expand_ratio": 1,
+  "fuse_cross_entropy": true,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "max_position_embeddings": 2048,
+  "model_type": "hgrn_bit",
+  "num_heads": 1,
+  "num_hidden_layers": 24,
+  "rms_norm_eps": 1e-06,
+  "share_conv_kernel": true,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.2",
+  "use_cache": false,
+  "use_lower_bound": true,
+  "use_short_conv": false,
+  "vocab_size": 32000
+}