Upload 12 files

Browse files

Files changed (13) hide show

.gitattributes +1 -0
README.md +202 -3
adapter_config.json +29 -0
adapter_model.safetensors +3 -0
config.json +37 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +5 -0
tokenizer.json +3 -0
tokenizer_config.json +14 -0
trainer_state.json +866 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,202 @@
----
-license: cc-by-nc-4.0
----

+---
+base_model: meta-llama/Llama-3.2-1B
+library_name: peft
+---
+# Model Card for a LoRA Adapter of meta-llama/Llama-3.2-1B
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** meta-llama/Llama-3.2-1B
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.2-1B",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75583563cbafded86923ccc7c068135ff1f3b00014672deee75222ff4043ec3d
+size 6824216

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "meta-llama/Llama-3.2-1B",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128001,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "float16",
+  "transformers_version": "4.46.2",
+  "use_cache": true,
+  "vocab_size": 128256
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ae0b21ad040c4836eccac554da257d33180517cc8e529a000e3dd6aa767202f
+size 13685562

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2c3a21d70cec36bfa6f82dfbb5af4831d56920036f06fbdac17bef3430d9874
+size 14308

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:390cb479ce37ac976fe37dde53c0dff6bde9d7414b0dcbd7914816f957383670
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "pad_token": "<|end_of_text|>",
+  "eos_token": "<|end_of_text|>",
+  "unk_token": null
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "vocab_file": null,
+  "name_or_path": "meta-llama/Llama-3.2-1B",
+  "padding_side": "right",
+  "pad_token": "<|end_of_text|>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,866 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997407311381903,
+  "eval_steps": 50,
+  "global_step": 964,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.010370754472387866,
+      "grad_norm": 2.4068312644958496,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 4.4724,
+      "step": 10
+    },
+    {
+      "epoch": 0.020741508944775732,
+      "grad_norm": 2.3241941928863525,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 4.5137,
+      "step": 20
+    },
+    {
+      "epoch": 0.0311122634171636,
+      "grad_norm": 2.4529693126678467,
+      "learning_rate": 1.5e-06,
+      "loss": 4.431,
+      "step": 30
+    },
+    {
+      "epoch": 0.041483017889551464,
+      "grad_norm": 2.5506527423858643,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 4.4615,
+      "step": 40
+    },
+    {
+      "epoch": 0.05185377236193933,
+      "grad_norm": 2.6286089420318604,
+      "learning_rate": 2.5e-06,
+      "loss": 4.4173,
+      "step": 50
+    },
+    {
+      "epoch": 0.05185377236193933,
+      "eval_loss": 4.529317855834961,
+      "eval_runtime": 43.0684,
+      "eval_samples_per_second": 79.594,
+      "eval_steps_per_second": 9.961,
+      "step": 50
+    },
+    {
+      "epoch": 0.0622245268343272,
+      "grad_norm": 2.2027931213378906,
+      "learning_rate": 3e-06,
+      "loss": 4.3936,
+      "step": 60
+    },
+    {
+      "epoch": 0.07259528130671507,
+      "grad_norm": 2.632085084915161,
+      "learning_rate": 3.5e-06,
+      "loss": 4.4038,
+      "step": 70
+    },
+    {
+      "epoch": 0.08296603577910293,
+      "grad_norm": 2.330366849899292,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 4.3844,
+      "step": 80
+    },
+    {
+      "epoch": 0.09333679025149079,
+      "grad_norm": 2.4520134925842285,
+      "learning_rate": 4.5e-06,
+      "loss": 4.3325,
+      "step": 90
+    },
+    {
+      "epoch": 0.10370754472387866,
+      "grad_norm": 2.727679491043091,
+      "learning_rate": 5e-06,
+      "loss": 4.2768,
+      "step": 100
+    },
+    {
+      "epoch": 0.10370754472387866,
+      "eval_loss": 4.328857898712158,
+      "eval_runtime": 43.0263,
+      "eval_samples_per_second": 79.672,
+      "eval_steps_per_second": 9.971,
+      "step": 100
+    },
+    {
+      "epoch": 0.11407829919626652,
+      "grad_norm": 2.3905959129333496,
+      "learning_rate": 5.500000000000001e-06,
+      "loss": 4.1575,
+      "step": 110
+    },
+    {
+      "epoch": 0.1244490536686544,
+      "grad_norm": 2.3810746669769287,
+      "learning_rate": 6e-06,
+      "loss": 4.1188,
+      "step": 120
+    },
+    {
+      "epoch": 0.13481980814104227,
+      "grad_norm": 2.3154499530792236,
+      "learning_rate": 6.5000000000000004e-06,
+      "loss": 4.0751,
+      "step": 130
+    },
+    {
+      "epoch": 0.14519056261343014,
+      "grad_norm": 2.404163360595703,
+      "learning_rate": 7e-06,
+      "loss": 3.9433,
+      "step": 140
+    },
+    {
+      "epoch": 0.155561317085818,
+      "grad_norm": 2.620729446411133,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 3.9579,
+      "step": 150
+    },
+    {
+      "epoch": 0.155561317085818,
+      "eval_loss": 3.87788724899292,
+      "eval_runtime": 43.1648,
+      "eval_samples_per_second": 79.417,
+      "eval_steps_per_second": 9.939,
+      "step": 150
+    },
+    {
+      "epoch": 0.16593207155820586,
+      "grad_norm": 2.6772756576538086,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 3.7662,
+      "step": 160
+    },
+    {
+      "epoch": 0.17630282603059372,
+      "grad_norm": 2.6104724407196045,
+      "learning_rate": 8.5e-06,
+      "loss": 3.6483,
+      "step": 170
+    },
+    {
+      "epoch": 0.18667358050298158,
+      "grad_norm": 2.636183023452759,
+      "learning_rate": 9e-06,
+      "loss": 3.4924,
+      "step": 180
+    },
+    {
+      "epoch": 0.19704433497536947,
+      "grad_norm": 2.9193673133850098,
+      "learning_rate": 9.5e-06,
+      "loss": 3.33,
+      "step": 190
+    },
+    {
+      "epoch": 0.20741508944775733,
+      "grad_norm": 2.378948926925659,
+      "learning_rate": 1e-05,
+      "loss": 3.1667,
+      "step": 200
+    },
+    {
+      "epoch": 0.20741508944775733,
+      "eval_loss": 3.0792782306671143,
+      "eval_runtime": 43.0269,
+      "eval_samples_per_second": 79.671,
+      "eval_steps_per_second": 9.971,
+      "step": 200
+    },
+    {
+      "epoch": 0.2177858439201452,
+      "grad_norm": 3.6606717109680176,
+      "learning_rate": 9.86910994764398e-06,
+      "loss": 2.9038,
+      "step": 210
+    },
+    {
+      "epoch": 0.22815659839253305,
+      "grad_norm": 4.638175964355469,
+      "learning_rate": 9.73821989528796e-06,
+      "loss": 2.7723,
+      "step": 220
+    },
+    {
+      "epoch": 0.2385273528649209,
+      "grad_norm": 5.681021690368652,
+      "learning_rate": 9.607329842931939e-06,
+      "loss": 2.4375,
+      "step": 230
+    },
+    {
+      "epoch": 0.2488981073373088,
+      "grad_norm": 3.9302401542663574,
+      "learning_rate": 9.476439790575916e-06,
+      "loss": 2.2828,
+      "step": 240
+    },
+    {
+      "epoch": 0.25926886180969666,
+      "grad_norm": 3.4748728275299072,
+      "learning_rate": 9.345549738219896e-06,
+      "loss": 2.1372,
+      "step": 250
+    },
+    {
+      "epoch": 0.25926886180969666,
+      "eval_loss": 2.027852773666382,
+      "eval_runtime": 42.9733,
+      "eval_samples_per_second": 79.77,
+      "eval_steps_per_second": 9.983,
+      "step": 250
+    },
+    {
+      "epoch": 0.26963961628208455,
+      "grad_norm": 2.240591049194336,
+      "learning_rate": 9.214659685863875e-06,
+      "loss": 2.0402,
+      "step": 260
+    },
+    {
+      "epoch": 0.2800103707544724,
+      "grad_norm": 2.12862229347229,
+      "learning_rate": 9.083769633507853e-06,
+      "loss": 1.8311,
+      "step": 270
+    },
+    {
+      "epoch": 0.29038112522686027,
+      "grad_norm": 1.9156771898269653,
+      "learning_rate": 8.952879581151834e-06,
+      "loss": 1.7948,
+      "step": 280
+    },
+    {
+      "epoch": 0.3007518796992481,
+      "grad_norm": 1.2717920541763306,
+      "learning_rate": 8.821989528795813e-06,
+      "loss": 1.7745,
+      "step": 290
+    },
+    {
+      "epoch": 0.311122634171636,
+      "grad_norm": 0.9778507947921753,
+      "learning_rate": 8.691099476439791e-06,
+      "loss": 1.6066,
+      "step": 300
+    },
+    {
+      "epoch": 0.311122634171636,
+      "eval_loss": 1.6197232007980347,
+      "eval_runtime": 43.0004,
+      "eval_samples_per_second": 79.72,
+      "eval_steps_per_second": 9.977,
+      "step": 300
+    },
+    {
+      "epoch": 0.3214933886440238,
+      "grad_norm": 0.966334879398346,
+      "learning_rate": 8.56020942408377e-06,
+      "loss": 1.6183,
+      "step": 310
+    },
+    {
+      "epoch": 0.3318641431164117,
+      "grad_norm": 0.8336134552955627,
+      "learning_rate": 8.429319371727749e-06,
+      "loss": 1.543,
+      "step": 320
+    },
+    {
+      "epoch": 0.3422348975887996,
+      "grad_norm": 0.7293752431869507,
+      "learning_rate": 8.298429319371727e-06,
+      "loss": 1.5888,
+      "step": 330
+    },
+    {
+      "epoch": 0.35260565206118744,
+      "grad_norm": 0.7492266297340393,
+      "learning_rate": 8.167539267015708e-06,
+      "loss": 1.5612,
+      "step": 340
+    },
+    {
+      "epoch": 0.3629764065335753,
+      "grad_norm": 0.8373680710792542,
+      "learning_rate": 8.036649214659686e-06,
+      "loss": 1.547,
+      "step": 350
+    },
+    {
+      "epoch": 0.3629764065335753,
+      "eval_loss": 1.571603536605835,
+      "eval_runtime": 43.1239,
+      "eval_samples_per_second": 79.492,
+      "eval_steps_per_second": 9.948,
+      "step": 350
+    },
+    {
+      "epoch": 0.37334716100596316,
+      "grad_norm": 0.9682691097259521,
+      "learning_rate": 7.905759162303665e-06,
+      "loss": 1.6005,
+      "step": 360
+    },
+    {
+      "epoch": 0.38371791547835105,
+      "grad_norm": 0.6970401406288147,
+      "learning_rate": 7.774869109947646e-06,
+      "loss": 1.6102,
+      "step": 370
+    },
+    {
+      "epoch": 0.39408866995073893,
+      "grad_norm": 0.8149111866950989,
+      "learning_rate": 7.643979057591624e-06,
+      "loss": 1.5331,
+      "step": 380
+    },
+    {
+      "epoch": 0.40445942442312677,
+      "grad_norm": 0.6417681574821472,
+      "learning_rate": 7.513089005235603e-06,
+      "loss": 1.5559,
+      "step": 390
+    },
+    {
+      "epoch": 0.41483017889551466,
+      "grad_norm": 0.669866144657135,
+      "learning_rate": 7.382198952879581e-06,
+      "loss": 1.6237,
+      "step": 400
+    },
+    {
+      "epoch": 0.41483017889551466,
+      "eval_loss": 1.5569473505020142,
+      "eval_runtime": 43.0517,
+      "eval_samples_per_second": 79.625,
+      "eval_steps_per_second": 9.965,
+      "step": 400
+    },
+    {
+      "epoch": 0.4252009333679025,
+      "grad_norm": 0.7108224630355835,
+      "learning_rate": 7.25130890052356e-06,
+      "loss": 1.5205,
+      "step": 410
+    },
+    {
+      "epoch": 0.4355716878402904,
+      "grad_norm": 0.772306501865387,
+      "learning_rate": 7.12041884816754e-06,
+      "loss": 1.4833,
+      "step": 420
+    },
+    {
+      "epoch": 0.44594244231267827,
+      "grad_norm": 0.8170768618583679,
+      "learning_rate": 6.989528795811519e-06,
+      "loss": 1.506,
+      "step": 430
+    },
+    {
+      "epoch": 0.4563131967850661,
+      "grad_norm": 0.7127036452293396,
+      "learning_rate": 6.858638743455498e-06,
+      "loss": 1.642,
+      "step": 440
+    },
+    {
+      "epoch": 0.466683951257454,
+      "grad_norm": 1.1019853353500366,
+      "learning_rate": 6.727748691099477e-06,
+      "loss": 1.5815,
+      "step": 450
+    },
+    {
+      "epoch": 0.466683951257454,
+      "eval_loss": 1.5491901636123657,
+      "eval_runtime": 42.974,
+      "eval_samples_per_second": 79.769,
+      "eval_steps_per_second": 9.983,
+      "step": 450
+    },
+    {
+      "epoch": 0.4770547057298418,
+      "grad_norm": 0.7836682200431824,
+      "learning_rate": 6.5968586387434565e-06,
+      "loss": 1.479,
+      "step": 460
+    },
+    {
+      "epoch": 0.4874254602022297,
+      "grad_norm": 0.8299842476844788,
+      "learning_rate": 6.465968586387435e-06,
+      "loss": 1.4768,
+      "step": 470
+    },
+    {
+      "epoch": 0.4977962146746176,
+      "grad_norm": 0.7423719763755798,
+      "learning_rate": 6.335078534031414e-06,
+      "loss": 1.5919,
+      "step": 480
+    },
+    {
+      "epoch": 0.5081669691470054,
+      "grad_norm": 0.7347830533981323,
+      "learning_rate": 6.204188481675393e-06,
+      "loss": 1.4697,
+      "step": 490
+    },
+    {
+      "epoch": 0.5185377236193933,
+      "grad_norm": 0.8458806276321411,
+      "learning_rate": 6.073298429319372e-06,
+      "loss": 1.5822,
+      "step": 500
+    },
+    {
+      "epoch": 0.5185377236193933,
+      "eval_loss": 1.5439085960388184,
+      "eval_runtime": 43.1032,
+      "eval_samples_per_second": 79.53,
+      "eval_steps_per_second": 9.953,
+      "step": 500
+    },
+    {
+      "epoch": 0.5289084780917812,
+      "grad_norm": 0.8292895555496216,
+      "learning_rate": 5.942408376963351e-06,
+      "loss": 1.5543,
+      "step": 510
+    },
+    {
+      "epoch": 0.5392792325641691,
+      "grad_norm": 0.7892965078353882,
+      "learning_rate": 5.81151832460733e-06,
+      "loss": 1.6241,
+      "step": 520
+    },
+    {
+      "epoch": 0.5496499870365569,
+      "grad_norm": 0.8499513268470764,
+      "learning_rate": 5.680628272251309e-06,
+      "loss": 1.4915,
+      "step": 530
+    },
+    {
+      "epoch": 0.5600207415089448,
+      "grad_norm": 0.8531098365783691,
+      "learning_rate": 5.549738219895289e-06,
+      "loss": 1.5094,
+      "step": 540
+    },
+    {
+      "epoch": 0.5703914959813327,
+      "grad_norm": 0.7012779116630554,
+      "learning_rate": 5.418848167539268e-06,
+      "loss": 1.5539,
+      "step": 550
+    },
+    {
+      "epoch": 0.5703914959813327,
+      "eval_loss": 1.5399216413497925,
+      "eval_runtime": 43.067,
+      "eval_samples_per_second": 79.597,
+      "eval_steps_per_second": 9.961,
+      "step": 550
+    },
+    {
+      "epoch": 0.5807622504537205,
+      "grad_norm": 0.7626951336860657,
+      "learning_rate": 5.287958115183246e-06,
+      "loss": 1.5038,
+      "step": 560
+    },
+    {
+      "epoch": 0.5911330049261084,
+      "grad_norm": 0.8458223938941956,
+      "learning_rate": 5.157068062827225e-06,
+      "loss": 1.5217,
+      "step": 570
+    },
+    {
+      "epoch": 0.6015037593984962,
+      "grad_norm": 0.8810559511184692,
+      "learning_rate": 5.026178010471204e-06,
+      "loss": 1.6896,
+      "step": 580
+    },
+    {
+      "epoch": 0.6118745138708841,
+      "grad_norm": 0.9249419569969177,
+      "learning_rate": 4.895287958115184e-06,
+      "loss": 1.5184,
+      "step": 590
+    },
+    {
+      "epoch": 0.622245268343272,
+      "grad_norm": 0.7158748507499695,
+      "learning_rate": 4.764397905759163e-06,
+      "loss": 1.5405,
+      "step": 600
+    },
+    {
+      "epoch": 0.622245268343272,
+      "eval_loss": 1.5371109247207642,
+      "eval_runtime": 43.1456,
+      "eval_samples_per_second": 79.452,
+      "eval_steps_per_second": 9.943,
+      "step": 600
+    },
+    {
+      "epoch": 0.6326160228156599,
+      "grad_norm": 0.8123712539672852,
+      "learning_rate": 4.633507853403142e-06,
+      "loss": 1.4703,
+      "step": 610
+    },
+    {
+      "epoch": 0.6429867772880477,
+      "grad_norm": 0.8977182507514954,
+      "learning_rate": 4.502617801047121e-06,
+      "loss": 1.5568,
+      "step": 620
+    },
+    {
+      "epoch": 0.6533575317604355,
+      "grad_norm": 0.8391156792640686,
+      "learning_rate": 4.3717277486910996e-06,
+      "loss": 1.5993,
+      "step": 630
+    },
+    {
+      "epoch": 0.6637282862328234,
+      "grad_norm": 0.7252123355865479,
+      "learning_rate": 4.240837696335079e-06,
+      "loss": 1.5162,
+      "step": 640
+    },
+    {
+      "epoch": 0.6740990407052113,
+      "grad_norm": 0.7567150592803955,
+      "learning_rate": 4.109947643979058e-06,
+      "loss": 1.5821,
+      "step": 650
+    },
+    {
+      "epoch": 0.6740990407052113,
+      "eval_loss": 1.5346648693084717,
+      "eval_runtime": 43.0456,
+      "eval_samples_per_second": 79.637,
+      "eval_steps_per_second": 9.966,
+      "step": 650
+    },
+    {
+      "epoch": 0.6844697951775992,
+      "grad_norm": 0.6526748538017273,
+      "learning_rate": 3.9790575916230365e-06,
+      "loss": 1.5429,
+      "step": 660
+    },
+    {
+      "epoch": 0.694840549649987,
+      "grad_norm": 0.7770061492919922,
+      "learning_rate": 3.848167539267016e-06,
+      "loss": 1.497,
+      "step": 670
+    },
+    {
+      "epoch": 0.7052113041223749,
+      "grad_norm": 0.6573889255523682,
+      "learning_rate": 3.717277486910995e-06,
+      "loss": 1.6247,
+      "step": 680
+    },
+    {
+      "epoch": 0.7155820585947628,
+      "grad_norm": 0.9382066130638123,
+      "learning_rate": 3.5863874345549743e-06,
+      "loss": 1.5577,
+      "step": 690
+    },
+    {
+      "epoch": 0.7259528130671506,
+      "grad_norm": 0.9911208748817444,
+      "learning_rate": 3.455497382198953e-06,
+      "loss": 1.4734,
+      "step": 700
+    },
+    {
+      "epoch": 0.7259528130671506,
+      "eval_loss": 1.5329481363296509,
+      "eval_runtime": 43.1201,
+      "eval_samples_per_second": 79.499,
+      "eval_steps_per_second": 9.949,
+      "step": 700
+    },
+    {
+      "epoch": 0.7363235675395385,
+      "grad_norm": 0.8948063850402832,
+      "learning_rate": 3.324607329842932e-06,
+      "loss": 1.5257,
+      "step": 710
+    },
+    {
+      "epoch": 0.7466943220119263,
+      "grad_norm": 1.0471000671386719,
+      "learning_rate": 3.1937172774869113e-06,
+      "loss": 1.5289,
+      "step": 720
+    },
+    {
+      "epoch": 0.7570650764843142,
+      "grad_norm": 0.7089968323707581,
+      "learning_rate": 3.0628272251308904e-06,
+      "loss": 1.5721,
+      "step": 730
+    },
+    {
+      "epoch": 0.7674358309567021,
+      "grad_norm": 0.9314925074577332,
+      "learning_rate": 2.931937172774869e-06,
+      "loss": 1.4879,
+      "step": 740
+    },
+    {
+      "epoch": 0.77780658542909,
+      "grad_norm": 0.8222401142120361,
+      "learning_rate": 2.8010471204188483e-06,
+      "loss": 1.5909,
+      "step": 750
+    },
+    {
+      "epoch": 0.77780658542909,
+      "eval_loss": 1.5315285921096802,
+      "eval_runtime": 43.1263,
+      "eval_samples_per_second": 79.487,
+      "eval_steps_per_second": 9.948,
+      "step": 750
+    },
+    {
+      "epoch": 0.7881773399014779,
+      "grad_norm": 0.7002791166305542,
+      "learning_rate": 2.6701570680628274e-06,
+      "loss": 1.5853,
+      "step": 760
+    },
+    {
+      "epoch": 0.7985480943738656,
+      "grad_norm": 0.7302571535110474,
+      "learning_rate": 2.5392670157068065e-06,
+      "loss": 1.4632,
+      "step": 770
+    },
+    {
+      "epoch": 0.8089188488462535,
+      "grad_norm": 0.785142719745636,
+      "learning_rate": 2.4083769633507856e-06,
+      "loss": 1.505,
+      "step": 780
+    },
+    {
+      "epoch": 0.8192896033186414,
+      "grad_norm": 0.6490882039070129,
+      "learning_rate": 2.2774869109947643e-06,
+      "loss": 1.4813,
+      "step": 790
+    },
+    {
+      "epoch": 0.8296603577910293,
+      "grad_norm": 0.7147834897041321,
+      "learning_rate": 2.1465968586387435e-06,
+      "loss": 1.4852,
+      "step": 800
+    },
+    {
+      "epoch": 0.8296603577910293,
+      "eval_loss": 1.5305155515670776,
+      "eval_runtime": 43.0366,
+      "eval_samples_per_second": 79.653,
+      "eval_steps_per_second": 9.968,
+      "step": 800
+    },
+    {
+      "epoch": 0.8400311122634172,
+      "grad_norm": 0.742734432220459,
+      "learning_rate": 2.0157068062827226e-06,
+      "loss": 1.4627,
+      "step": 810
+    },
+    {
+      "epoch": 0.850401866735805,
+      "grad_norm": 0.7220650315284729,
+      "learning_rate": 1.8848167539267017e-06,
+      "loss": 1.4692,
+      "step": 820
+    },
+    {
+      "epoch": 0.8607726212081929,
+      "grad_norm": 0.8684506416320801,
+      "learning_rate": 1.7539267015706806e-06,
+      "loss": 1.56,
+      "step": 830
+    },
+    {
+      "epoch": 0.8711433756805808,
+      "grad_norm": 0.7521070241928101,
+      "learning_rate": 1.6230366492146598e-06,
+      "loss": 1.5089,
+      "step": 840
+    },
+    {
+      "epoch": 0.8815141301529686,
+      "grad_norm": 0.9445785284042358,
+      "learning_rate": 1.4921465968586387e-06,
+      "loss": 1.6033,
+      "step": 850
+    },
+    {
+      "epoch": 0.8815141301529686,
+      "eval_loss": 1.5298349857330322,
+      "eval_runtime": 42.9802,
+      "eval_samples_per_second": 79.758,
+      "eval_steps_per_second": 9.981,
+      "step": 850
+    },
+    {
+      "epoch": 0.8918848846253565,
+      "grad_norm": 0.7844976186752319,
+      "learning_rate": 1.361256544502618e-06,
+      "loss": 1.5412,
+      "step": 860
+    },
+    {
+      "epoch": 0.9022556390977443,
+      "grad_norm": 0.9173896312713623,
+      "learning_rate": 1.230366492146597e-06,
+      "loss": 1.585,
+      "step": 870
+    },
+    {
+      "epoch": 0.9126263935701322,
+      "grad_norm": 0.7674463391304016,
+      "learning_rate": 1.099476439790576e-06,
+      "loss": 1.533,
+      "step": 880
+    },
+    {
+      "epoch": 0.9229971480425201,
+      "grad_norm": 0.901545524597168,
+      "learning_rate": 9.685863874345552e-07,
+      "loss": 1.6416,
+      "step": 890
+    },
+    {
+      "epoch": 0.933367902514908,
+      "grad_norm": 0.760588526725769,
+      "learning_rate": 8.376963350785341e-07,
+      "loss": 1.6217,
+      "step": 900
+    },
+    {
+      "epoch": 0.933367902514908,
+      "eval_loss": 1.529255747795105,
+      "eval_runtime": 42.9676,
+      "eval_samples_per_second": 79.781,
+      "eval_steps_per_second": 9.984,
+      "step": 900
+    },
+    {
+      "epoch": 0.9437386569872959,
+      "grad_norm": 0.780006468296051,
+      "learning_rate": 7.068062827225131e-07,
+      "loss": 1.5711,
+      "step": 910
+    },
+    {
+      "epoch": 0.9541094114596836,
+      "grad_norm": 0.6572290062904358,
+      "learning_rate": 5.759162303664922e-07,
+      "loss": 1.5525,
+      "step": 920
+    },
+    {
+      "epoch": 0.9644801659320715,
+      "grad_norm": 0.7653405666351318,
+      "learning_rate": 4.4502617801047125e-07,
+      "loss": 1.5585,
+      "step": 930
+    },
+    {
+      "epoch": 0.9748509204044594,
+      "grad_norm": 0.9417358636856079,
+      "learning_rate": 3.1413612565445027e-07,
+      "loss": 1.5995,
+      "step": 940
+    },
+    {
+      "epoch": 0.9852216748768473,
+      "grad_norm": 0.752137303352356,
+      "learning_rate": 1.8324607329842932e-07,
+      "loss": 1.6332,
+      "step": 950
+    },
+    {
+      "epoch": 0.9852216748768473,
+      "eval_loss": 1.5290166139602661,
+      "eval_runtime": 43.1488,
+      "eval_samples_per_second": 79.446,
+      "eval_steps_per_second": 9.942,
+      "step": 950
+    },
+    {
+      "epoch": 0.9955924293492352,
+      "grad_norm": 0.7827558517456055,
+      "learning_rate": 5.235602094240838e-08,
+      "loss": 1.5047,
+      "step": 960
+    },
+    {
+      "epoch": 0.9997407311381903,
+      "step": 964,
+      "total_flos": 9.238171939032269e+16,
+      "train_loss": 2.1392775007303326,
+      "train_runtime": 1823.6358,
+      "train_samples_per_second": 16.917,
+      "train_steps_per_second": 0.529
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 964,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.238171939032269e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:195382ac5713558e378690087fc69d5cef2015d44a6c0281a44175c5bed3e8cf
+size 5304