abanm committed on Dec 12, 2024

Commit

23d7c37

1 Parent(s): 0a415ab

Initial commit: Uploading project files

Browse files

Files changed (43) hide show

Dubs/v0.0.0/added_tokens.json +13 -0
Dubs/v0.0.0/config.json +153 -0
Dubs/v0.0.0/generation_config.json +11 -0
Dubs/v0.0.0/model.safetensors +3 -0
Dubs/v0.0.0/special_tokens_map.json +30 -0
Dubs/v0.0.0/tokenizer.json +0 -0
Dubs/v0.0.0/tokenizer.model +3 -0
Dubs/v0.0.0/tokenizer_config.json +132 -0
Dubs/v0.0.1/P_C/checkpoint-5900/README.md +202 -0
Dubs/v0.0.1/P_C/checkpoint-5900/adapter_config.json +32 -0
Dubs/v0.0.1/P_C/checkpoint-5900/adapter_model.safetensors +3 -0
Dubs/v0.0.1/P_C/checkpoint-5900/added_tokens.json +13 -0
Dubs/v0.0.1/P_C/checkpoint-5900/optimizer.pt +3 -0
Dubs/v0.0.1/P_C/checkpoint-5900/rng_state.pth +3 -0
Dubs/v0.0.1/P_C/checkpoint-5900/scheduler.pt +3 -0
Dubs/v0.0.1/P_C/checkpoint-5900/special_tokens_map.json +30 -0
Dubs/v0.0.1/P_C/checkpoint-5900/tokenizer.json +0 -0
Dubs/v0.0.1/P_C/checkpoint-5900/tokenizer.model +3 -0
Dubs/v0.0.1/P_C/checkpoint-5900/tokenizer_config.json +132 -0
Dubs/v0.0.1/P_C/checkpoint-5900/trainer_state.json +1685 -0
Dubs/v0.0.1/P_C/checkpoint-5900/training_args.bin +3 -0
Dubs/v0.0.1/P_C/runs/Dec12_06-02-48_a2e074eee72a/events.out.tfevents.1733983589.a2e074eee72a.7056.0 +3 -0
Dubs/v0.0.1/P_C/runs/Dec12_06-02-48_a2e074eee72a/events.out.tfevents.1733983648.a2e074eee72a.7056.1 +3 -0
Dubs/v0.0.1/P_C/runs/Dec12_06-10-33_a2e074eee72a/events.out.tfevents.1733983892.a2e074eee72a.9377.0 +3 -0
Dubs/v0.0.1/P_C/runs/Dec12_06-17-12_a2e074eee72a/events.out.tfevents.1733984297.a2e074eee72a.10985.0 +3 -0
Dubs/v0.0.1/adapter/README.md +202 -0
Dubs/v0.0.1/adapter/adapter_config.json +32 -0
Dubs/v0.0.1/adapter/adapter_model.safetensors +3 -0
Dubs/v0.0.1/adapter/added_tokens.json +13 -0
Dubs/v0.0.1/adapter/special_tokens_map.json +30 -0
Dubs/v0.0.1/adapter/tokenizer.json +0 -0
Dubs/v0.0.1/adapter/tokenizer.model +3 -0
Dubs/v0.0.1/adapter/tokenizer_config.json +132 -0
Dubs/v0.0.1/full_model/added_tokens.json +13 -0
Dubs/v0.0.1/full_model/config.json +139 -0
Dubs/v0.0.1/full_model/generation_config.json +11 -0
Dubs/v0.0.1/full_model/model-00001-of-00002.safetensors +3 -0
Dubs/v0.0.1/full_model/model-00002-of-00002.safetensors +3 -0
Dubs/v0.0.1/full_model/model.safetensors.index.json +202 -0
Dubs/v0.0.1/full_model/special_tokens_map.json +24 -0
Dubs/v0.0.1/full_model/tokenizer.json +0 -0
Dubs/v0.0.1/full_model/tokenizer.model +3 -0
Dubs/v0.0.1/full_model/tokenizer_config.json +132 -0

Dubs/v0.0.0/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

Dubs/v0.0.0/config.json ADDED Viewed

	@@ -0,0 +1,153 @@

+{
+  "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
+    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
+  },
+  "bos_token_id": 1,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "model_type": "phi3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "pad_token_id": 32000,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "long_factor": [
+      1.0700000524520874,
+      1.1200000047683716,
+      1.149999976158142,
+      1.4199999570846558,
+      1.5699999332427979,
+      1.7999999523162842,
+      2.129999876022339,
+      2.129999876022339,
+      3.009999990463257,
+      5.910000324249268,
+      6.950000286102295,
+      9.070000648498535,
+      9.930000305175781,
+      10.710000038146973,
+      11.130000114440918,
+      14.609999656677246,
+      15.409998893737793,
+      19.809999465942383,
+      37.279998779296875,
+      38.279998779296875,
+      38.599998474121094,
+      40.12000274658203,
+      46.20000457763672,
+      50.940006256103516,
+      53.66000747680664,
+      54.9373893737793,
+      56.89738845825195,
+      57.28738784790039,
+      59.98738479614258,
+      60.86738586425781,
+      60.887386322021484,
+      61.71739196777344,
+      62.91739273071289,
+      62.957393646240234,
+      63.41739273071289,
+      63.8173942565918,
+      63.83739471435547,
+      63.897396087646484,
+      63.93739700317383,
+      64.06739807128906,
+      64.11434936523438,
+      64.12435150146484,
+      64.15435028076172,
+      64.19435119628906,
+      64.24435424804688,
+      64.57435607910156,
+      64.69000244140625,
+      64.76000213623047
+    ],
+    "short_factor": [
+      1.1,
+      1.1,
+      1.1,
+      1.3000000000000003,
+      1.3500000000000003,
+      1.3500000000000003,
+      1.4000000000000004,
+      1.5500000000000005,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.1000000000000005,
+      2.1000000000000005,
+      2.1500000000000004,
+      2.25,
+      2.25,
+      2.25,
+      2.25,
+      2.25,
+      2.3999999999999995,
+      2.4499999999999993,
+      2.499999999999999,
+      2.6999999999999984,
+      2.6999999999999984,
+      2.7499999999999982,
+      2.799999999999998,
+      2.8999999999999977,
+      3.049999999999997
+    ],
+    "type": "longrope"
+  },
+  "rope_theta": 10000.0,
+  "sliding_window": 262144,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 32064
+}

Dubs/v0.0.0/generation_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": [
+    32000,
+    32001,
+    32007
+  ],
+  "pad_token_id": 32000,
+  "transformers_version": "4.47.0"
+}

Dubs/v0.0.0/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30d951cab5124d0c1cc1dd0fc91dc1ea77f915d743e4487c60165e7239353e41
+size 2432922120

Dubs/v0.0.0/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Dubs/v0.0.0/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Dubs/v0.0.0/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

Dubs/v0.0.0/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

Dubs/v0.0.1/P_C/checkpoint-5900/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: microsoft/Phi-3-mini-128k-instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

Dubs/v0.0.1/P_C/checkpoint-5900/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "self_attn.qkv_proj",
+    "self_attn.o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Dubs/v0.0.1/P_C/checkpoint-5900/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6d33dd5e4c11ffdccca0885ca97ceacf5efdaf9a3c1afbdf1a4fc2d2c43813c
+size 151014592

Dubs/v0.0.1/P_C/checkpoint-5900/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

Dubs/v0.0.1/P_C/checkpoint-5900/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81f30a0ddb2755f61daf66f903dbe12a1d65c22b238e4a082d630794190591b4
+size 302069498

Dubs/v0.0.1/P_C/checkpoint-5900/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73f3161ada1a21f94b410f6d4af8520e48176439d1a40c69aa8fc1079a7b0862
+size 14244

Dubs/v0.0.1/P_C/checkpoint-5900/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b771924d1afb19146175598139018af62f2f0732d9d35762fdc1ce88ad379b7
+size 1064

Dubs/v0.0.1/P_C/checkpoint-5900/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Dubs/v0.0.1/P_C/checkpoint-5900/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Dubs/v0.0.1/P_C/checkpoint-5900/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

Dubs/v0.0.1/P_C/checkpoint-5900/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

Dubs/v0.0.1/P_C/checkpoint-5900/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1685 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 5900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0211864406779661,
+      "grad_norm": 0.3481608033180237,
+      "learning_rate": 2.11864406779661e-05,
+      "loss": 1.0844,
+      "step": 25
+    },
+    {
+      "epoch": 0.0423728813559322,
+      "grad_norm": 0.4737679064273834,
+      "learning_rate": 4.23728813559322e-05,
+      "loss": 0.968,
+      "step": 50
+    },
+    {
+      "epoch": 0.0635593220338983,
+      "grad_norm": 0.24385006725788116,
+      "learning_rate": 6.355932203389829e-05,
+      "loss": 0.7097,
+      "step": 75
+    },
+    {
+      "epoch": 0.0847457627118644,
+      "grad_norm": 0.32397332787513733,
+      "learning_rate": 8.47457627118644e-05,
+      "loss": 0.6995,
+      "step": 100
+    },
+    {
+      "epoch": 0.1059322033898305,
+      "grad_norm": 0.28883692622184753,
+      "learning_rate": 0.0001059322033898305,
+      "loss": 0.6362,
+      "step": 125
+    },
+    {
+      "epoch": 0.1271186440677966,
+      "grad_norm": 0.2939394414424896,
+      "learning_rate": 0.00012711864406779658,
+      "loss": 0.6646,
+      "step": 150
+    },
+    {
+      "epoch": 0.1483050847457627,
+      "grad_norm": 0.21506226062774658,
+      "learning_rate": 0.0001483050847457627,
+      "loss": 0.67,
+      "step": 175
+    },
+    {
+      "epoch": 0.1694915254237288,
+      "grad_norm": 0.24949543178081512,
+      "learning_rate": 0.00014999402230951556,
+      "loss": 0.6328,
+      "step": 200
+    },
+    {
+      "epoch": 0.1906779661016949,
+      "grad_norm": 0.1662125438451767,
+      "learning_rate": 0.00014997396600188487,
+      "loss": 0.6365,
+      "step": 225
+    },
+    {
+      "epoch": 0.211864406779661,
+      "grad_norm": 0.18493060767650604,
+      "learning_rate": 0.00014993978965384007,
+      "loss": 0.6661,
+      "step": 250
+    },
+    {
+      "epoch": 0.2330508474576271,
+      "grad_norm": 0.1724727302789688,
+      "learning_rate": 0.00014989149970190098,
+      "loss": 0.6453,
+      "step": 275
+    },
+    {
+      "epoch": 0.2542372881355932,
+      "grad_norm": 0.2265160232782364,
+      "learning_rate": 0.00014982910524063883,
+      "loss": 0.5802,
+      "step": 300
+    },
+    {
+      "epoch": 0.2754237288135593,
+      "grad_norm": 0.14229296147823334,
+      "learning_rate": 0.00014975261802096344,
+      "loss": 0.6559,
+      "step": 325
+    },
+    {
+      "epoch": 0.2966101694915254,
+      "grad_norm": 0.19628387689590454,
+      "learning_rate": 0.0001496620524479102,
+      "loss": 0.6181,
+      "step": 350
+    },
+    {
+      "epoch": 0.3177966101694915,
+      "grad_norm": 0.19808532297611237,
+      "learning_rate": 0.00014955742557792704,
+      "loss": 0.6363,
+      "step": 375
+    },
+    {
+      "epoch": 0.3389830508474576,
+      "grad_norm": 0.2479950338602066,
+      "learning_rate": 0.00014943875711566237,
+      "loss": 0.601,
+      "step": 400
+    },
+    {
+      "epoch": 0.3601694915254237,
+      "grad_norm": 0.18844148516654968,
+      "learning_rate": 0.0001493060694102537,
+      "loss": 0.6406,
+      "step": 425
+    },
+    {
+      "epoch": 0.3813559322033898,
+      "grad_norm": 0.21692270040512085,
+      "learning_rate": 0.00014915938745111896,
+      "loss": 0.674,
+      "step": 450
+    },
+    {
+      "epoch": 0.4025423728813559,
+      "grad_norm": 0.18362776935100555,
+      "learning_rate": 0.0001489987388632498,
+      "loss": 0.6326,
+      "step": 475
+    },
+    {
+      "epoch": 0.423728813559322,
+      "grad_norm": 0.1860133409500122,
+      "learning_rate": 0.0001488241539020092,
+      "loss": 0.6539,
+      "step": 500
+    },
+    {
+      "epoch": 0.4449152542372881,
+      "grad_norm": 0.16509853303432465,
+      "learning_rate": 0.00014863566544743326,
+      "loss": 0.6649,
+      "step": 525
+    },
+    {
+      "epoch": 0.4661016949152542,
+      "grad_norm": 0.17422816157341003,
+      "learning_rate": 0.0001484333089980388,
+      "loss": 0.6365,
+      "step": 550
+    },
+    {
+      "epoch": 0.4872881355932203,
+      "grad_norm": 0.16881784796714783,
+      "learning_rate": 0.000148217122664138,
+      "loss": 0.6014,
+      "step": 575
+    },
+    {
+      "epoch": 0.5084745762711864,
+      "grad_norm": 0.24150097370147705,
+      "learning_rate": 0.00014798714716066072,
+      "loss": 0.6225,
+      "step": 600
+    },
+    {
+      "epoch": 0.5296610169491526,
+      "grad_norm": 0.183096244931221,
+      "learning_rate": 0.00014774342579948675,
+      "loss": 0.628,
+      "step": 625
+    },
+    {
+      "epoch": 0.5508474576271186,
+      "grad_norm": 0.2092808037996292,
+      "learning_rate": 0.00014748600448128877,
+      "loss": 0.6196,
+      "step": 650
+    },
+    {
+      "epoch": 0.5720338983050848,
+      "grad_norm": 0.1650499850511551,
+      "learning_rate": 0.00014721493168688764,
+      "loss": 0.6617,
+      "step": 675
+    },
+    {
+      "epoch": 0.5932203389830508,
+      "grad_norm": 0.2336203157901764,
+      "learning_rate": 0.00014693025846812194,
+      "loss": 0.5995,
+      "step": 700
+    },
+    {
+      "epoch": 0.614406779661017,
+      "grad_norm": 0.1635483205318451,
+      "learning_rate": 0.0001466320384382333,
+      "loss": 0.6225,
+      "step": 725
+    },
+    {
+      "epoch": 0.635593220338983,
+      "grad_norm": 0.24543817341327667,
+      "learning_rate": 0.00014632032776176924,
+      "loss": 0.6208,
+      "step": 750
+    },
+    {
+      "epoch": 0.6567796610169492,
+      "grad_norm": 0.156394824385643,
+      "learning_rate": 0.0001459951851440055,
+      "loss": 0.6234,
+      "step": 775
+    },
+    {
+      "epoch": 0.6779661016949152,
+      "grad_norm": 0.21179354190826416,
+      "learning_rate": 0.00014565667181988995,
+      "loss": 0.6101,
+      "step": 800
+    },
+    {
+      "epoch": 0.6991525423728814,
+      "grad_norm": 0.1816495805978775,
+      "learning_rate": 0.00014530485154251021,
+      "loss": 0.6212,
+      "step": 825
+    },
+    {
+      "epoch": 0.7203389830508474,
+      "grad_norm": 0.18615126609802246,
+      "learning_rate": 0.0001449397905710866,
+      "loss": 0.6019,
+      "step": 850
+    },
+    {
+      "epoch": 0.7415254237288136,
+      "grad_norm": 0.13972151279449463,
+      "learning_rate": 0.00014456155765849355,
+      "loss": 0.6804,
+      "step": 875
+    },
+    {
+      "epoch": 0.7627118644067796,
+      "grad_norm": 0.19166871905326843,
+      "learning_rate": 0.00014417022403831117,
+      "loss": 0.6265,
+      "step": 900
+    },
+    {
+      "epoch": 0.7838983050847458,
+      "grad_norm": 0.1559162586927414,
+      "learning_rate": 0.00014376586341140955,
+      "loss": 0.5893,
+      "step": 925
+    },
+    {
+      "epoch": 0.8050847457627118,
+      "grad_norm": 0.17139187455177307,
+      "learning_rate": 0.0001433485519320687,
+      "loss": 0.6192,
+      "step": 950
+    },
+    {
+      "epoch": 0.826271186440678,
+      "grad_norm": 0.19588051736354828,
+      "learning_rate": 0.0001429183681936359,
+      "loss": 0.6545,
+      "step": 975
+    },
+    {
+      "epoch": 0.847457627118644,
+      "grad_norm": 0.17011399567127228,
+      "learning_rate": 0.0001424753932137243,
+      "loss": 0.6274,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8686440677966102,
+      "grad_norm": 0.13620993494987488,
+      "learning_rate": 0.00014201971041895455,
+      "loss": 0.6185,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8898305084745762,
+      "grad_norm": 0.19832104444503784,
+      "learning_rate": 0.00014155140562924286,
+      "loss": 0.5788,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9110169491525424,
+      "grad_norm": 0.15580902993679047,
+      "learning_rate": 0.00014107056704163823,
+      "loss": 0.6756,
+      "step": 1075
+    },
+    {
+      "epoch": 0.9322033898305084,
+      "grad_norm": 0.2072034329175949,
+      "learning_rate": 0.00014057728521371218,
+      "loss": 0.6347,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9533898305084746,
+      "grad_norm": 0.13679395616054535,
+      "learning_rate": 0.00014007165304650386,
+      "loss": 0.6419,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9745762711864406,
+      "grad_norm": 0.20975461602210999,
+      "learning_rate": 0.00013955376576702357,
+      "loss": 0.5929,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9957627118644068,
+      "grad_norm": 0.18808843195438385,
+      "learning_rate": 0.00013902372091031856,
+      "loss": 0.6327,
+      "step": 1175
+    },
+    {
+      "epoch": 1.0169491525423728,
+      "grad_norm": 0.12700864672660828,
+      "learning_rate": 0.00013848161830110395,
+      "loss": 0.6166,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0381355932203389,
+      "grad_norm": 0.17502394318580627,
+      "learning_rate": 0.0001379275600349625,
+      "loss": 0.542,
+      "step": 1225
+    },
+    {
+      "epoch": 1.0593220338983051,
+      "grad_norm": 0.17643524706363678,
+      "learning_rate": 0.0001373616504591167,
+      "loss": 0.6077,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0805084745762712,
+      "grad_norm": 0.21401630342006683,
+      "learning_rate": 0.00013678399615277674,
+      "loss": 0.5856,
+      "step": 1275
+    },
+    {
+      "epoch": 1.1016949152542372,
+      "grad_norm": 0.1577410101890564,
+      "learning_rate": 0.00013619470590706814,
+      "loss": 0.5882,
+      "step": 1300
+    },
+    {
+      "epoch": 1.1228813559322033,
+      "grad_norm": 0.2284272313117981,
+      "learning_rate": 0.00013559389070454304,
+      "loss": 0.5842,
+      "step": 1325
+    },
+    {
+      "epoch": 1.1440677966101696,
+      "grad_norm": 0.2204512506723404,
+      "learning_rate": 0.00013498166369827833,
+      "loss": 0.5911,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1652542372881356,
+      "grad_norm": 0.21209457516670227,
+      "learning_rate": 0.00013435814019056535,
+      "loss": 0.602,
+      "step": 1375
+    },
+    {
+      "epoch": 1.1864406779661016,
+      "grad_norm": 0.16774219274520874,
+      "learning_rate": 0.00013372343761119466,
+      "loss": 0.5746,
+      "step": 1400
+    },
+    {
+      "epoch": 1.207627118644068,
+      "grad_norm": 0.23171478509902954,
+      "learning_rate": 0.00013307767549534033,
+      "loss": 0.6046,
+      "step": 1425
+    },
+    {
+      "epoch": 1.228813559322034,
+      "grad_norm": 0.17449446022510529,
+      "learning_rate": 0.00013242097546104734,
+      "loss": 0.5969,
+      "step": 1450
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.21454857289791107,
+      "learning_rate": 0.00013175346118632713,
+      "loss": 0.5927,
+      "step": 1475
+    },
+    {
+      "epoch": 1.271186440677966,
+      "grad_norm": 0.17533324658870697,
+      "learning_rate": 0.00013107525838586495,
+      "loss": 0.5806,
+      "step": 1500
+    },
+    {
+      "epoch": 1.292372881355932,
+      "grad_norm": 0.2303514927625656,
+      "learning_rate": 0.00013038649478734363,
+      "loss": 0.6269,
+      "step": 1525
+    },
+    {
+      "epoch": 1.3135593220338984,
+      "grad_norm": 0.2209363877773285,
+      "learning_rate": 0.00012968730010738837,
+      "loss": 0.5699,
+      "step": 1550
+    },
+    {
+      "epoch": 1.3347457627118644,
+      "grad_norm": 0.2777274250984192,
+      "learning_rate": 0.0001289778060271368,
+      "loss": 0.5583,
+      "step": 1575
+    },
+    {
+      "epoch": 1.3559322033898304,
+      "grad_norm": 0.19397616386413574,
+      "learning_rate": 0.00012825814616743928,
+      "loss": 0.5785,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3771186440677967,
+      "grad_norm": 0.24071291089057922,
+      "learning_rate": 0.0001275284560636935,
+      "loss": 0.5793,
+      "step": 1625
+    },
+    {
+      "epoch": 1.3983050847457628,
+      "grad_norm": 0.16364933550357819,
+      "learning_rate": 0.000126788873140319,
+      "loss": 0.5591,
+      "step": 1650
+    },
+    {
+      "epoch": 1.4194915254237288,
+      "grad_norm": 0.2222534716129303,
+      "learning_rate": 0.00012603953668487546,
+      "loss": 0.5649,
+      "step": 1675
+    },
+    {
+      "epoch": 1.4406779661016949,
+      "grad_norm": 0.18990883231163025,
+      "learning_rate": 0.00012528058782183048,
+      "loss": 0.5732,
+      "step": 1700
+    },
+    {
+      "epoch": 1.461864406779661,
+      "grad_norm": 0.23255659639835358,
+      "learning_rate": 0.00012451216948598117,
+      "loss": 0.55,
+      "step": 1725
+    },
+    {
+      "epoch": 1.4830508474576272,
+      "grad_norm": 0.19624237716197968,
+      "learning_rate": 0.00012373442639553487,
+      "loss": 0.5793,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5042372881355932,
+      "grad_norm": 0.24238888919353485,
+      "learning_rate": 0.00012294750502485398,
+      "loss": 0.5823,
+      "step": 1775
+    },
+    {
+      "epoch": 1.5254237288135593,
+      "grad_norm": 0.2002212405204773,
+      "learning_rate": 0.00012215155357687017,
+      "loss": 0.571,
+      "step": 1800
+    },
+    {
+      "epoch": 1.5466101694915255,
+      "grad_norm": 0.21096192300319672,
+      "learning_rate": 0.0001213467219551728,
+      "loss": 0.588,
+      "step": 1825
+    },
+    {
+      "epoch": 1.5677966101694916,
+      "grad_norm": 0.20380620658397675,
+      "learning_rate": 0.00012053316173577726,
+      "loss": 0.5869,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5889830508474576,
+      "grad_norm": 0.25443893671035767,
+      "learning_rate": 0.00011971102613857823,
+      "loss": 0.5659,
+      "step": 1875
+    },
+    {
+      "epoch": 1.6101694915254239,
+      "grad_norm": 0.22190341353416443,
+      "learning_rate": 0.0001188804699984935,
+      "loss": 0.5835,
+      "step": 1900
+    },
+    {
+      "epoch": 1.6313559322033897,
+      "grad_norm": 0.24329130351543427,
+      "learning_rate": 0.00011804164973630335,
+      "loss": 0.5639,
+      "step": 1925
+    },
+    {
+      "epoch": 1.652542372881356,
+      "grad_norm": 0.2349741905927658,
+      "learning_rate": 0.00011719472332919148,
+      "loss": 0.5726,
+      "step": 1950
+    },
+    {
+      "epoch": 1.673728813559322,
+      "grad_norm": 0.20963279902935028,
+      "learning_rate": 0.00011633985028099284,
+      "loss": 0.5612,
+      "step": 1975
+    },
+    {
+      "epoch": 1.694915254237288,
+      "grad_norm": 0.27600300312042236,
+      "learning_rate": 0.00011547719159215378,
+      "loss": 0.5943,
+      "step": 2000
+    },
+    {
+      "epoch": 1.7161016949152543,
+      "grad_norm": 0.21020427346229553,
+      "learning_rate": 0.00011460690972941037,
+      "loss": 0.5802,
+      "step": 2025
+    },
+    {
+      "epoch": 1.7372881355932204,
+      "grad_norm": 0.20670145750045776,
+      "learning_rate": 0.00011372916859519075,
+      "loss": 0.5766,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7584745762711864,
+      "grad_norm": 0.2435368299484253,
+      "learning_rate": 0.0001128441334967469,
+      "loss": 0.6128,
+      "step": 2075
+    },
+    {
+      "epoch": 1.7796610169491527,
+      "grad_norm": 0.21429473161697388,
+      "learning_rate": 0.00011195197111502184,
+      "loss": 0.5844,
+      "step": 2100
+    },
+    {
+      "epoch": 1.8008474576271185,
+      "grad_norm": 0.21995683014392853,
+      "learning_rate": 0.0001110528494732583,
+      "loss": 0.5532,
+      "step": 2125
+    },
+    {
+      "epoch": 1.8220338983050848,
+      "grad_norm": 0.19685518741607666,
+      "learning_rate": 0.00011014693790535437,
+      "loss": 0.5569,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8432203389830508,
+      "grad_norm": 0.20260564982891083,
+      "learning_rate": 0.00010923440702397243,
+      "loss": 0.5792,
+      "step": 2175
+    },
+    {
+      "epoch": 1.8644067796610169,
+      "grad_norm": 0.19778716564178467,
+      "learning_rate": 0.00010831542868840729,
+      "loss": 0.5978,
+      "step": 2200
+    },
+    {
+      "epoch": 1.8855932203389831,
+      "grad_norm": 0.22923052310943604,
+      "learning_rate": 0.00010739017597221942,
+      "loss": 0.5572,
+      "step": 2225
+    },
+    {
+      "epoch": 1.9067796610169492,
+      "grad_norm": 0.21343784034252167,
+      "learning_rate": 0.00010645882313063953,
+      "loss": 0.5643,
+      "step": 2250
+    },
+    {
+      "epoch": 1.9279661016949152,
+      "grad_norm": 0.2053421288728714,
+      "learning_rate": 0.00010552154556775076,
+      "loss": 0.5806,
+      "step": 2275
+    },
+    {
+      "epoch": 1.9491525423728815,
+      "grad_norm": 0.22164656221866608,
+      "learning_rate": 0.00010457851980345423,
+      "loss": 0.6011,
+      "step": 2300
+    },
+    {
+      "epoch": 1.9703389830508473,
+      "grad_norm": 0.284758985042572,
+      "learning_rate": 0.00010362992344022468,
+      "loss": 0.5374,
+      "step": 2325
+    },
+    {
+      "epoch": 1.9915254237288136,
+      "grad_norm": 0.2642022371292114,
+      "learning_rate": 0.00010267593512966216,
+      "loss": 0.5892,
+      "step": 2350
+    },
+    {
+      "epoch": 2.01271186440678,
+      "grad_norm": 0.19165368378162384,
+      "learning_rate": 0.00010171673453884601,
+      "loss": 0.5175,
+      "step": 2375
+    },
+    {
+      "epoch": 2.0338983050847457,
+      "grad_norm": 0.2643072307109833,
+      "learning_rate": 0.00010075250231649775,
+      "loss": 0.5204,
+      "step": 2400
+    },
+    {
+      "epoch": 2.055084745762712,
+      "grad_norm": 0.2326943427324295,
+      "learning_rate": 9.978342005895911e-05,
+      "loss": 0.4847,
+      "step": 2425
+    },
+    {
+      "epoch": 2.0762711864406778,
+      "grad_norm": 0.2779608368873596,
+      "learning_rate": 9.880967027599139e-05,
+      "loss": 0.52,
+      "step": 2450
+    },
+    {
+      "epoch": 2.097457627118644,
+      "grad_norm": 0.22342316806316376,
+      "learning_rate": 9.783143635640304e-05,
+      "loss": 0.5124,
+      "step": 2475
+    },
+    {
+      "epoch": 2.1186440677966103,
+      "grad_norm": 0.26453691720962524,
+      "learning_rate": 9.684890253351153e-05,
+      "loss": 0.4954,
+      "step": 2500
+    },
+    {
+      "epoch": 2.139830508474576,
+      "grad_norm": 0.26683682203292847,
+      "learning_rate": 9.586225385044615e-05,
+      "loss": 0.519,
+      "step": 2525
+    },
+    {
+      "epoch": 2.1610169491525424,
+      "grad_norm": 0.27656257152557373,
+      "learning_rate": 9.487167612529851e-05,
+      "loss": 0.5409,
+      "step": 2550
+    },
+    {
+      "epoch": 2.1822033898305087,
+      "grad_norm": 0.27244171500205994,
+      "learning_rate": 9.387735591612677e-05,
+      "loss": 0.4976,
+      "step": 2575
+    },
+    {
+      "epoch": 2.2033898305084745,
+      "grad_norm": 0.29296210408210754,
+      "learning_rate": 9.28794804858208e-05,
+      "loss": 0.4964,
+      "step": 2600
+    },
+    {
+      "epoch": 2.2245762711864407,
+      "grad_norm": 0.28374531865119934,
+      "learning_rate": 9.187823776683444e-05,
+      "loss": 0.4936,
+      "step": 2625
+    },
+    {
+      "epoch": 2.2457627118644066,
+      "grad_norm": 0.25039607286453247,
+      "learning_rate": 9.087381632579165e-05,
+      "loss": 0.4548,
+      "step": 2650
+    },
+    {
+      "epoch": 2.266949152542373,
+      "grad_norm": 0.2839612662792206,
+      "learning_rate": 8.986640532797341e-05,
+      "loss": 0.521,
+      "step": 2675
+    },
+    {
+      "epoch": 2.288135593220339,
+      "grad_norm": 0.26817333698272705,
+      "learning_rate": 8.885619450169154e-05,
+      "loss": 0.4813,
+      "step": 2700
+    },
+    {
+      "epoch": 2.309322033898305,
+      "grad_norm": 0.2513103187084198,
+      "learning_rate": 8.78433741025568e-05,
+      "loss": 0.4964,
+      "step": 2725
+    },
+    {
+      "epoch": 2.330508474576271,
+      "grad_norm": 0.2661533057689667,
+      "learning_rate": 8.682813487764759e-05,
+      "loss": 0.5267,
+      "step": 2750
+    },
+    {
+      "epoch": 2.3516949152542375,
+      "grad_norm": 0.31996023654937744,
+      "learning_rate": 8.581066802958593e-05,
+      "loss": 0.4877,
+      "step": 2775
+    },
+    {
+      "epoch": 2.3728813559322033,
+      "grad_norm": 0.3120092749595642,
+      "learning_rate": 8.479116518052793e-05,
+      "loss": 0.5025,
+      "step": 2800
+    },
+    {
+      "epoch": 2.3940677966101696,
+      "grad_norm": 0.25984951853752136,
+      "learning_rate": 8.376981833607496e-05,
+      "loss": 0.5184,
+      "step": 2825
+    },
+    {
+      "epoch": 2.415254237288136,
+      "grad_norm": 0.28586438298225403,
+      "learning_rate": 8.274681984911279e-05,
+      "loss": 0.5128,
+      "step": 2850
+    },
+    {
+      "epoch": 2.4364406779661016,
+      "grad_norm": 0.23898103833198547,
+      "learning_rate": 8.172236238358537e-05,
+      "loss": 0.4968,
+      "step": 2875
+    },
+    {
+      "epoch": 2.457627118644068,
+      "grad_norm": 0.2596363127231598,
+      "learning_rate": 8.069663887820978e-05,
+      "loss": 0.5338,
+      "step": 2900
+    },
+    {
+      "epoch": 2.4788135593220337,
+      "grad_norm": 0.2569097578525543,
+      "learning_rate": 7.966984251013964e-05,
+      "loss": 0.5186,
+      "step": 2925
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.23606939613819122,
+      "learning_rate": 7.864216665858362e-05,
+      "loss": 0.5087,
+      "step": 2950
+    },
+    {
+      "epoch": 2.5211864406779663,
+      "grad_norm": 0.24160584807395935,
+      "learning_rate": 7.761380486838573e-05,
+      "loss": 0.5164,
+      "step": 2975
+    },
+    {
+      "epoch": 2.542372881355932,
+      "grad_norm": 0.3212146461009979,
+      "learning_rate": 7.658495081357461e-05,
+      "loss": 0.5173,
+      "step": 3000
+    },
+    {
+      "epoch": 2.5635593220338984,
+      "grad_norm": 0.22904744744300842,
+      "learning_rate": 7.555579826088837e-05,
+      "loss": 0.5345,
+      "step": 3025
+    },
+    {
+      "epoch": 2.584745762711864,
+      "grad_norm": 0.31355756521224976,
+      "learning_rate": 7.452654103328196e-05,
+      "loss": 0.4683,
+      "step": 3050
+    },
+    {
+      "epoch": 2.6059322033898304,
+      "grad_norm": 0.31533321738243103,
+      "learning_rate": 7.349737297342404e-05,
+      "loss": 0.5259,
+      "step": 3075
+    },
+    {
+      "epoch": 2.6271186440677967,
+      "grad_norm": 0.2956444025039673,
+      "learning_rate": 7.24684879071901e-05,
+      "loss": 0.497,
+      "step": 3100
+    },
+    {
+      "epoch": 2.648305084745763,
+      "grad_norm": 0.2766103446483612,
+      "learning_rate": 7.14400796071587e-05,
+      "loss": 0.5166,
+      "step": 3125
+    },
+    {
+      "epoch": 2.669491525423729,
+      "grad_norm": 0.3354440927505493,
+      "learning_rate": 7.041234175611775e-05,
+      "loss": 0.5233,
+      "step": 3150
+    },
+    {
+      "epoch": 2.690677966101695,
+      "grad_norm": 0.2812809348106384,
+      "learning_rate": 6.938546791058785e-05,
+      "loss": 0.5155,
+      "step": 3175
+    },
+    {
+      "epoch": 2.711864406779661,
+      "grad_norm": 0.39217862486839294,
+      "learning_rate": 6.835965146436916e-05,
+      "loss": 0.4926,
+      "step": 3200
+    },
+    {
+      "epoch": 2.733050847457627,
+      "grad_norm": 0.3037302494049072,
+      "learning_rate": 6.73350856121191e-05,
+      "loss": 0.5098,
+      "step": 3225
+    },
+    {
+      "epoch": 2.7542372881355934,
+      "grad_norm": 0.2784561514854431,
+      "learning_rate": 6.63119633129675e-05,
+      "loss": 0.5371,
+      "step": 3250
+    },
+    {
+      "epoch": 2.7754237288135593,
+      "grad_norm": 0.2815192639827728,
+      "learning_rate": 6.529047725417618e-05,
+      "loss": 0.4839,
+      "step": 3275
+    },
+    {
+      "epoch": 2.7966101694915255,
+      "grad_norm": 0.26870056986808777,
+      "learning_rate": 6.427081981484946e-05,
+      "loss": 0.4981,
+      "step": 3300
+    },
+    {
+      "epoch": 2.8177966101694913,
+      "grad_norm": 0.28585174679756165,
+      "learning_rate": 6.325318302970318e-05,
+      "loss": 0.4841,
+      "step": 3325
+    },
+    {
+      "epoch": 2.8389830508474576,
+      "grad_norm": 0.2712132930755615,
+      "learning_rate": 6.22377585528981e-05,
+      "loss": 0.4833,
+      "step": 3350
+    },
+    {
+      "epoch": 2.860169491525424,
+      "grad_norm": 0.28583309054374695,
+      "learning_rate": 6.12247376219452e-05,
+      "loss": 0.5043,
+      "step": 3375
+    },
+    {
+      "epoch": 2.8813559322033897,
+      "grad_norm": 0.29179123044013977,
+      "learning_rate": 6.021431102168954e-05,
+      "loss": 0.5343,
+      "step": 3400
+    },
+    {
+      "epoch": 2.902542372881356,
+      "grad_norm": 0.29638585448265076,
+      "learning_rate": 5.92066690483792e-05,
+      "loss": 0.501,
+      "step": 3425
+    },
+    {
+      "epoch": 2.923728813559322,
+      "grad_norm": 0.2945152521133423,
+      "learning_rate": 5.820200147382617e-05,
+      "loss": 0.5149,
+      "step": 3450
+    },
+    {
+      "epoch": 2.944915254237288,
+      "grad_norm": 0.24451757967472076,
+      "learning_rate": 5.720049750966638e-05,
+      "loss": 0.501,
+      "step": 3475
+    },
+    {
+      "epoch": 2.9661016949152543,
+      "grad_norm": 0.33959802985191345,
+      "learning_rate": 5.6202345771724785e-05,
+      "loss": 0.5202,
+      "step": 3500
+    },
+    {
+      "epoch": 2.9872881355932206,
+      "grad_norm": 0.40264537930488586,
+      "learning_rate": 5.520773424449299e-05,
+      "loss": 0.5004,
+      "step": 3525
+    },
+    {
+      "epoch": 3.0084745762711864,
+      "grad_norm": 0.23446495831012726,
+      "learning_rate": 5.421685024572547e-05,
+      "loss": 0.4788,
+      "step": 3550
+    },
+    {
+      "epoch": 3.0296610169491527,
+      "grad_norm": 0.29302000999450684,
+      "learning_rate": 5.322988039116176e-05,
+      "loss": 0.4302,
+      "step": 3575
+    },
+    {
+      "epoch": 3.0508474576271185,
+      "grad_norm": 0.28345516324043274,
+      "learning_rate": 5.224701055938047e-05,
+      "loss": 0.4195,
+      "step": 3600
+    },
+    {
+      "epoch": 3.0720338983050848,
+      "grad_norm": 0.3563604950904846,
+      "learning_rate": 5.126842585679235e-05,
+      "loss": 0.4302,
+      "step": 3625
+    },
+    {
+      "epoch": 3.093220338983051,
+      "grad_norm": 0.2989650070667267,
+      "learning_rate": 5.0294310582778717e-05,
+      "loss": 0.4082,
+      "step": 3650
+    },
+    {
+      "epoch": 3.114406779661017,
+      "grad_norm": 0.3035448491573334,
+      "learning_rate": 4.9324848194981906e-05,
+      "loss": 0.4294,
+      "step": 3675
+    },
+    {
+      "epoch": 3.135593220338983,
+      "grad_norm": 0.3060661256313324,
+      "learning_rate": 4.83602212747541e-05,
+      "loss": 0.4243,
+      "step": 3700
+    },
+    {
+      "epoch": 3.156779661016949,
+      "grad_norm": 0.3512302041053772,
+      "learning_rate": 4.7400611492771505e-05,
+      "loss": 0.4558,
+      "step": 3725
+    },
+    {
+      "epoch": 3.1779661016949152,
+      "grad_norm": 0.3085233271121979,
+      "learning_rate": 4.644619957481972e-05,
+      "loss": 0.4405,
+      "step": 3750
+    },
+    {
+      "epoch": 3.1991525423728815,
+      "grad_norm": 0.37406814098358154,
+      "learning_rate": 4.549716526775711e-05,
+      "loss": 0.4394,
+      "step": 3775
+    },
+    {
+      "epoch": 3.2203389830508473,
+      "grad_norm": 0.28444594144821167,
+      "learning_rate": 4.455368730566282e-05,
+      "loss": 0.4356,
+      "step": 3800
+    },
+    {
+      "epoch": 3.2415254237288136,
+      "grad_norm": 0.3252512812614441,
+      "learning_rate": 4.361594337617518e-05,
+      "loss": 0.4422,
+      "step": 3825
+    },
+    {
+      "epoch": 3.26271186440678,
+      "grad_norm": 0.34911468625068665,
+      "learning_rate": 4.2684110087027364e-05,
+      "loss": 0.42,
+      "step": 3850
+    },
+    {
+      "epoch": 3.2838983050847457,
+      "grad_norm": 0.31359365582466125,
+      "learning_rate": 4.175836293278635e-05,
+      "loss": 0.4229,
+      "step": 3875
+    },
+    {
+      "epoch": 3.305084745762712,
+      "grad_norm": 0.332359254360199,
+      "learning_rate": 4.083887626180175e-05,
+      "loss": 0.4428,
+      "step": 3900
+    },
+    {
+      "epoch": 3.326271186440678,
+      "grad_norm": 0.3841429054737091,
+      "learning_rate": 3.992582324337009e-05,
+      "loss": 0.4643,
+      "step": 3925
+    },
+    {
+      "epoch": 3.347457627118644,
+      "grad_norm": 0.3356688618659973,
+      "learning_rate": 3.901937583512158e-05,
+      "loss": 0.4169,
+      "step": 3950
+    },
+    {
+      "epoch": 3.3686440677966103,
+      "grad_norm": 0.39436978101730347,
+      "learning_rate": 3.811970475063486e-05,
+      "loss": 0.4564,
+      "step": 3975
+    },
+    {
+      "epoch": 3.389830508474576,
+      "grad_norm": 0.29478755593299866,
+      "learning_rate": 3.7226979427285943e-05,
+      "loss": 0.3858,
+      "step": 4000
+    },
+    {
+      "epoch": 3.4110169491525424,
+      "grad_norm": 0.4711458086967468,
+      "learning_rate": 3.6341367994337784e-05,
+      "loss": 0.4547,
+      "step": 4025
+    },
+    {
+      "epoch": 3.4322033898305087,
+      "grad_norm": 0.38489460945129395,
+      "learning_rate": 3.546303724127603e-05,
+      "loss": 0.4235,
+      "step": 4050
+    },
+    {
+      "epoch": 3.4533898305084745,
+      "grad_norm": 0.41311007738113403,
+      "learning_rate": 3.459215258639708e-05,
+      "loss": 0.4589,
+      "step": 4075
+    },
+    {
+      "epoch": 3.4745762711864407,
+      "grad_norm": 0.3139210641384125,
+      "learning_rate": 3.372887804565442e-05,
+      "loss": 0.4163,
+      "step": 4100
+    },
+    {
+      "epoch": 3.4957627118644066,
+      "grad_norm": 0.43436604738235474,
+      "learning_rate": 3.2873376201769154e-05,
+      "loss": 0.4465,
+      "step": 4125
+    },
+    {
+      "epoch": 3.516949152542373,
+      "grad_norm": 0.37427470088005066,
+      "learning_rate": 3.202580817361037e-05,
+      "loss": 0.4106,
+      "step": 4150
+    },
+    {
+      "epoch": 3.538135593220339,
+      "grad_norm": 0.3729758560657501,
+      "learning_rate": 3.1186333585851056e-05,
+      "loss": 0.47,
+      "step": 4175
+    },
+    {
+      "epoch": 3.559322033898305,
+      "grad_norm": 0.3862791955471039,
+      "learning_rate": 3.0355110538905815e-05,
+      "loss": 0.3975,
+      "step": 4200
+    },
+    {
+      "epoch": 3.580508474576271,
+      "grad_norm": 0.35095420479774475,
+      "learning_rate": 2.953229557915525e-05,
+      "loss": 0.4422,
+      "step": 4225
+    },
+    {
+      "epoch": 3.601694915254237,
+      "grad_norm": 0.34636810421943665,
+      "learning_rate": 2.871804366946315e-05,
+      "loss": 0.428,
+      "step": 4250
+    },
+    {
+      "epoch": 3.6228813559322033,
+      "grad_norm": 0.3737597167491913,
+      "learning_rate": 2.791250815999207e-05,
+      "loss": 0.4544,
+      "step": 4275
+    },
+    {
+      "epoch": 3.6440677966101696,
+      "grad_norm": 0.3554207384586334,
+      "learning_rate": 2.7115840759322436e-05,
+      "loss": 0.4167,
+      "step": 4300
+    },
+    {
+      "epoch": 3.665254237288136,
+      "grad_norm": 0.369305819272995,
+      "learning_rate": 2.6359522461221096e-05,
+      "loss": 0.4456,
+      "step": 4325
+    },
+    {
+      "epoch": 3.6864406779661016,
+      "grad_norm": 0.40377670526504517,
+      "learning_rate": 2.5580670208969884e-05,
+      "loss": 0.4465,
+      "step": 4350
+    },
+    {
+      "epoch": 3.707627118644068,
+      "grad_norm": 0.4016803801059723,
+      "learning_rate": 2.4811125226576454e-05,
+      "loss": 0.4395,
+      "step": 4375
+    },
+    {
+      "epoch": 3.7288135593220337,
+      "grad_norm": 0.3124406337738037,
+      "learning_rate": 2.405103244443235e-05,
+      "loss": 0.4154,
+      "step": 4400
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.44163626432418823,
+      "learning_rate": 2.330053501277194e-05,
+      "loss": 0.4607,
+      "step": 4425
+    },
+    {
+      "epoch": 3.7711864406779663,
+      "grad_norm": 0.33251988887786865,
+      "learning_rate": 2.2559774274712466e-05,
+      "loss": 0.4114,
+      "step": 4450
+    },
+    {
+      "epoch": 3.792372881355932,
+      "grad_norm": 0.4052109718322754,
+      "learning_rate": 2.1828889739634496e-05,
+      "loss": 0.4123,
+      "step": 4475
+    },
+    {
+      "epoch": 3.8135593220338984,
+      "grad_norm": 0.3507472276687622,
+      "learning_rate": 2.110801905690787e-05,
+      "loss": 0.4199,
+      "step": 4500
+    },
+    {
+      "epoch": 3.834745762711864,
+      "grad_norm": 0.4040756821632385,
+      "learning_rate": 2.03972979899678e-05,
+      "loss": 0.4526,
+      "step": 4525
+    },
+    {
+      "epoch": 3.8559322033898304,
+      "grad_norm": 0.30861154198646545,
+      "learning_rate": 1.9696860390746082e-05,
+      "loss": 0.4152,
+      "step": 4550
+    },
+    {
+      "epoch": 3.8771186440677967,
+      "grad_norm": 0.4708113670349121,
+      "learning_rate": 1.900683817446263e-05,
+      "loss": 0.4477,
+      "step": 4575
+    },
+    {
+      "epoch": 3.898305084745763,
+      "grad_norm": 0.3677612543106079,
+      "learning_rate": 1.832736129478131e-05,
+      "loss": 0.4279,
+      "step": 4600
+    },
+    {
+      "epoch": 3.919491525423729,
+      "grad_norm": 0.3834724724292755,
+      "learning_rate": 1.7658557719335652e-05,
+      "loss": 0.4235,
+      "step": 4625
+    },
+    {
+      "epoch": 3.940677966101695,
+      "grad_norm": 0.3320079445838928,
+      "learning_rate": 1.7000553405628164e-05,
+      "loss": 0.4103,
+      "step": 4650
+    },
+    {
+      "epoch": 3.961864406779661,
+      "grad_norm": 0.4474587142467499,
+      "learning_rate": 1.6353472277308618e-05,
+      "loss": 0.4422,
+      "step": 4675
+    },
+    {
+      "epoch": 3.983050847457627,
+      "grad_norm": 0.3154617249965668,
+      "learning_rate": 1.571743620083504e-05,
+      "loss": 0.4343,
+      "step": 4700
+    },
+    {
+      "epoch": 4.004237288135593,
+      "grad_norm": 0.32371950149536133,
+      "learning_rate": 1.5092564962522388e-05,
+      "loss": 0.452,
+      "step": 4725
+    },
+    {
+      "epoch": 4.02542372881356,
+      "grad_norm": 0.3051236867904663,
+      "learning_rate": 1.447897624598286e-05,
+      "loss": 0.4164,
+      "step": 4750
+    },
+    {
+      "epoch": 4.046610169491525,
+      "grad_norm": 0.3187614679336548,
+      "learning_rate": 1.3876785609962218e-05,
+      "loss": 0.3446,
+      "step": 4775
+    },
+    {
+      "epoch": 4.067796610169491,
+      "grad_norm": 0.48843175172805786,
+      "learning_rate": 1.3286106466576264e-05,
+      "loss": 0.4296,
+      "step": 4800
+    },
+    {
+      "epoch": 4.088983050847458,
+      "grad_norm": 0.3983837068080902,
+      "learning_rate": 1.2707050059951763e-05,
+      "loss": 0.344,
+      "step": 4825
+    },
+    {
+      "epoch": 4.110169491525424,
+      "grad_norm": 0.300611674785614,
+      "learning_rate": 1.2139725445275481e-05,
+      "loss": 0.4169,
+      "step": 4850
+    },
+    {
+      "epoch": 4.13135593220339,
+      "grad_norm": 0.4292912781238556,
+      "learning_rate": 1.158423946825549e-05,
+      "loss": 0.3689,
+      "step": 4875
+    },
+    {
+      "epoch": 4.1525423728813555,
+      "grad_norm": 0.3964712917804718,
+      "learning_rate": 1.1040696744998754e-05,
+      "loss": 0.4404,
+      "step": 4900
+    },
+    {
+      "epoch": 4.173728813559322,
+      "grad_norm": 0.6776478886604309,
+      "learning_rate": 1.0509199642308436e-05,
+      "loss": 0.3979,
+      "step": 4925
+    },
+    {
+      "epoch": 4.194915254237288,
+      "grad_norm": 0.396267831325531,
+      "learning_rate": 9.98984825840486e-06,
+      "loss": 0.4182,
+      "step": 4950
+    },
+    {
+      "epoch": 4.216101694915254,
+      "grad_norm": 0.28718650341033936,
+      "learning_rate": 9.482740404073851e-06,
+      "loss": 0.3736,
+      "step": 4975
+    },
+    {
+      "epoch": 4.237288135593221,
+      "grad_norm": 0.3323756158351898,
+      "learning_rate": 8.987971584245729e-06,
+      "loss": 0.4113,
+      "step": 5000
+    },
+    {
+      "epoch": 4.258474576271187,
+      "grad_norm": 0.33957767486572266,
+      "learning_rate": 8.50563498000856e-06,
+      "loss": 0.3925,
+      "step": 5025
+    },
+    {
+      "epoch": 4.279661016949152,
+      "grad_norm": 0.4178178906440735,
+      "learning_rate": 8.035821431059244e-06,
+      "loss": 0.3973,
+      "step": 5050
+    },
+    {
+      "epoch": 4.3008474576271185,
+      "grad_norm": 0.3192192614078522,
+      "learning_rate": 7.578619418595358e-06,
+      "loss": 0.3605,
+      "step": 5075
+    },
+    {
+      "epoch": 4.322033898305085,
+      "grad_norm": 0.4187626540660858,
+      "learning_rate": 7.1341150486512374e-06,
+      "loss": 0.4199,
+      "step": 5100
+    },
+    {
+      "epoch": 4.343220338983051,
+      "grad_norm": 0.3863602578639984,
+      "learning_rate": 6.702392035881507e-06,
+      "loss": 0.3568,
+      "step": 5125
+    },
+    {
+      "epoch": 4.364406779661017,
+      "grad_norm": 0.4073178172111511,
+      "learning_rate": 6.28353168779481e-06,
+      "loss": 0.4327,
+      "step": 5150
+    },
+    {
+      "epoch": 4.385593220338983,
+      "grad_norm": 0.31056177616119385,
+      "learning_rate": 5.8776128894409305e-06,
+      "loss": 0.372,
+      "step": 5175
+    },
+    {
+      "epoch": 4.406779661016949,
+      "grad_norm": 0.3671024739742279,
+      "learning_rate": 5.484712088554253e-06,
+      "loss": 0.4078,
+      "step": 5200
+    },
+    {
+      "epoch": 4.427966101694915,
+      "grad_norm": 0.2966119349002838,
+      "learning_rate": 5.1049032811561196e-06,
+      "loss": 0.3529,
+      "step": 5225
+    },
+    {
+      "epoch": 4.4491525423728815,
+      "grad_norm": 0.3545999526977539,
+      "learning_rate": 4.7382579976189244e-06,
+      "loss": 0.3864,
+      "step": 5250
+    },
+    {
+      "epoch": 4.470338983050848,
+      "grad_norm": 0.3902367651462555,
+      "learning_rate": 4.384845289194699e-06,
+      "loss": 0.3434,
+      "step": 5275
+    },
+    {
+      "epoch": 4.491525423728813,
+      "grad_norm": 0.4561343193054199,
+      "learning_rate": 4.044731715010463e-06,
+      "loss": 0.371,
+      "step": 5300
+    },
+    {
+      "epoch": 4.512711864406779,
+      "grad_norm": 0.29569247364997864,
+      "learning_rate": 3.717981329532979e-06,
+      "loss": 0.3957,
+      "step": 5325
+    },
+    {
+      "epoch": 4.533898305084746,
+      "grad_norm": 0.3961041271686554,
+      "learning_rate": 3.4046556705051744e-06,
+      "loss": 0.3938,
+      "step": 5350
+    },
+    {
+      "epoch": 4.555084745762712,
+      "grad_norm": 0.35009700059890747,
+      "learning_rate": 3.104813747356674e-06,
+      "loss": 0.3829,
+      "step": 5375
+    },
+    {
+      "epoch": 4.576271186440678,
+      "grad_norm": 0.404491662979126,
+      "learning_rate": 2.8185120300902865e-06,
+      "loss": 0.3916,
+      "step": 5400
+    },
+    {
+      "epoch": 4.597457627118644,
+      "grad_norm": 0.3277469277381897,
+      "learning_rate": 2.5458044386469727e-06,
+      "loss": 0.3681,
+      "step": 5425
+    },
+    {
+      "epoch": 4.61864406779661,
+      "grad_norm": 0.4005700349807739,
+      "learning_rate": 2.2867423327508654e-06,
+      "loss": 0.4249,
+      "step": 5450
+    },
+    {
+      "epoch": 4.639830508474576,
+      "grad_norm": 0.30087345838546753,
+      "learning_rate": 2.0413745022366285e-06,
+      "loss": 0.3493,
+      "step": 5475
+    },
+    {
+      "epoch": 4.661016949152542,
+      "grad_norm": 0.37881365418434143,
+      "learning_rate": 1.8097471578607164e-06,
+      "loss": 0.4209,
+      "step": 5500
+    },
+    {
+      "epoch": 4.682203389830509,
+      "grad_norm": 0.40850409865379333,
+      "learning_rate": 1.5919039225983782e-06,
+      "loss": 0.378,
+      "step": 5525
+    },
+    {
+      "epoch": 4.703389830508475,
+      "grad_norm": 0.43627145886421204,
+      "learning_rate": 1.3878858234280532e-06,
+      "loss": 0.4131,
+      "step": 5550
+    },
+    {
+      "epoch": 4.72457627118644,
+      "grad_norm": 0.3629254400730133,
+      "learning_rate": 1.1977312836046194e-06,
+      "loss": 0.3555,
+      "step": 5575
+    },
+    {
+      "epoch": 4.745762711864407,
+      "grad_norm": 0.4231952428817749,
+      "learning_rate": 1.0214761154230643e-06,
+      "loss": 0.4459,
+      "step": 5600
+    },
+    {
+      "epoch": 4.766949152542373,
+      "grad_norm": 0.32848870754241943,
+      "learning_rate": 8.591535134738814e-07,
+      "loss": 0.3753,
+      "step": 5625
+    },
+    {
+      "epoch": 4.788135593220339,
+      "grad_norm": 0.49593624472618103,
+      "learning_rate": 7.107940483913943e-07,
+      "loss": 0.4109,
+      "step": 5650
+    },
+    {
+      "epoch": 4.809322033898305,
+      "grad_norm": 0.3230677545070648,
+      "learning_rate": 5.764256610963636e-07,
+      "loss": 0.3534,
+      "step": 5675
+    },
+    {
+      "epoch": 4.830508474576272,
+      "grad_norm": 0.4409547746181488,
+      "learning_rate": 4.560736575337787e-07,
+      "loss": 0.4389,
+      "step": 5700
+    },
+    {
+      "epoch": 4.851694915254237,
+      "grad_norm": 0.3816058039665222,
+      "learning_rate": 3.4976070390692054e-07,
+      "loss": 0.369,
+      "step": 5725
+    },
+    {
+      "epoch": 4.872881355932203,
+      "grad_norm": 0.3902296721935272,
+      "learning_rate": 2.5750682240857634e-07,
+      "loss": 0.4134,
+      "step": 5750
+    },
+    {
+      "epoch": 4.8940677966101696,
+      "grad_norm": 0.3721590042114258,
+      "learning_rate": 1.7932938745022218e-07,
+      "loss": 0.3509,
+      "step": 5775
+    },
+    {
+      "epoch": 4.915254237288136,
+      "grad_norm": 0.3812599778175354,
+      "learning_rate": 1.1524312238984923e-07,
+      "loss": 0.4109,
+      "step": 5800
+    },
+    {
+      "epoch": 4.936440677966102,
+      "grad_norm": 0.39883601665496826,
+      "learning_rate": 6.526009675905663e-08,
+      "loss": 0.3768,
+      "step": 5825
+    },
+    {
+      "epoch": 4.9576271186440675,
+      "grad_norm": 0.38190439343452454,
+      "learning_rate": 2.9389723990011495e-08,
+      "loss": 0.4262,
+      "step": 5850
+    },
+    {
+      "epoch": 4.978813559322034,
+      "grad_norm": 0.2927350699901581,
+      "learning_rate": 7.638759642525361e-09,
+      "loss": 0.3631,
+      "step": 5875
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.7745693922042847,
+      "learning_rate": 1.1300131838587468e-11,
+      "loss": 0.3756,
+      "step": 5900
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 5900,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0720875463474176e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

Dubs/v0.0.1/P_C/checkpoint-5900/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6faca1b9fd727ce19a4e50f007c131e02d78148e09461d970f397278ecf3ffa0
+size 5624

Dubs/v0.0.1/P_C/runs/Dec12_06-02-48_a2e074eee72a/events.out.tfevents.1733983589.a2e074eee72a.7056.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc809b22a58fd5ed2cd448d73ace6135eb68c85cac9786108f0bee1ce456c60e
+size 8602

Dubs/v0.0.1/P_C/runs/Dec12_06-02-48_a2e074eee72a/events.out.tfevents.1733983648.a2e074eee72a.7056.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:474e4628ebf07e405bd4c646d488271f97507e54a7150168f6e473caced75dd9
+size 8280

Dubs/v0.0.1/P_C/runs/Dec12_06-10-33_a2e074eee72a/events.out.tfevents.1733983892.a2e074eee72a.9377.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d55dded40238a95f193004b7b02be734fe04b087948d91c9578a64474a73adcf
+size 8602

Dubs/v0.0.1/P_C/runs/Dec12_06-17-12_a2e074eee72a/events.out.tfevents.1733984297.a2e074eee72a.10985.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74c99d63f04f50f9a020b8b965285ab6681d001187a157d62055a84d165c3c90
+size 58732

Dubs/v0.0.1/adapter/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: microsoft/Phi-3-mini-128k-instruct
+library_name: peft
+---
+# Model Card for Dubs v0.0.1 (LoRA adapter)
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** microsoft/Phi-3-mini-128k-instruct
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+Fine-tuned on the `RayBernard/leetcode` dataset. [More Information Needed on pre-processing and filtering]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

Dubs/v0.0.1/adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "self_attn.qkv_proj",
+    "self_attn.o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Dubs/v0.0.1/adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6d33dd5e4c11ffdccca0885ca97ceacf5efdaf9a3c1afbdf1a4fc2d2c43813c
+size 151014592

Dubs/v0.0.1/adapter/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

Dubs/v0.0.1/adapter/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Dubs/v0.0.1/adapter/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Dubs/v0.0.1/adapter/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

Dubs/v0.0.1/adapter/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

Dubs/v0.0.1/full_model/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

Dubs/v0.0.1/full_model/config.json ADDED Viewed

	@@ -0,0 +1,139 @@

+{
+  "_name_or_path": "Dubs_0.0.1",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
+    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
+  },
+  "bos_token_id": 1,
+  "description": "Finetuned on RayBernard/leetcode",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "model_type": "phi3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "pad_token_id": 32000,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "long_factor": [
+      1.0700000524520874,
+      1.1200000047683716,
+      1.149999976158142,
+      1.4199999570846558,
+      1.5699999332427979,
+      1.7999999523162842,
+      2.129999876022339,
+      2.129999876022339,
+      3.009999990463257,
+      5.910000324249268,
+      6.950000286102295,
+      9.070000648498535,
+      9.930000305175781,
+      10.710000038146973,
+      11.130000114440918,
+      14.609999656677246,
+      15.409998893737793,
+      19.809999465942383,
+      37.279998779296875,
+      38.279998779296875,
+      38.599998474121094,
+      40.12000274658203,
+      46.20000457763672,
+      50.940006256103516,
+      53.66000747680664,
+      54.9373893737793,
+      56.89738845825195,
+      57.28738784790039,
+      59.98738479614258,
+      60.86738586425781,
+      60.887386322021484,
+      61.71739196777344,
+      62.91739273071289,
+      62.957393646240234,
+      63.41739273071289,
+      63.8173942565918,
+      63.83739471435547,
+      63.897396087646484,
+      63.93739700317383,
+      64.06739807128906,
+      64.11434936523438,
+      64.12435150146484,
+      64.15435028076172,
+      64.19435119628906,
+      64.24435424804688,
+      64.57435607910156,
+      64.69000244140625,
+      64.76000213623047
+    ],
+    "short_factor": [
+      1.1,
+      1.1,
+      1.1,
+      1.3000000000000003,
+      1.3500000000000003,
+      1.3500000000000003,
+      1.4000000000000004,
+      1.5500000000000005,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.0500000000000007,
+      2.1000000000000005,
+      2.1000000000000005,
+      2.1500000000000004,
+      2.25,
+      2.25,
+      2.25,
+      2.25,
+      2.25,
+      2.3999999999999995,
+      2.4499999999999993,
+      2.499999999999999,
+      2.6999999999999984,
+      2.6999999999999984,
+      2.7499999999999982,
+      2.799999999999998,
+      2.8999999999999977,
+      3.049999999999997
+    ],
+    "type": "longrope"
+  },
+  "rope_theta": 10000.0,
+  "sliding_window": 262144,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 32064
+}

Dubs/v0.0.1/full_model/generation_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": [
+    32000,
+    32001,
+    32007
+  ],
+  "pad_token_id": 32000,
+  "transformers_version": "4.47.0"
+}

Dubs/v0.0.1/full_model/model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b31a3ce5fa4205069b877b3fa1fddb062da5ab32ae148eddfa0796079be8da
+size 4972489200

Dubs/v0.0.1/full_model/model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4134ffe1adc5710e27927489ff6021862bef37f388f1d96210a66b6240142cad
+size 2669692488

Dubs/v0.0.1/full_model/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,202 @@

+{
+  "metadata": {
+    "total_size": 7642159104
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}

Dubs/v0.0.1/full_model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Dubs/v0.0.1/full_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Dubs/v0.0.1/full_model/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

Dubs/v0.0.1/full_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}