sengi committed
Commit a320ff5
1 Parent(s): 91b089b

Model save

README.md CHANGED
@@ -1,16 +1,13 @@
 ---
+license: apache-2.0
 library_name: peft
 tags:
-- alignment-handbook
-- trl
-- sft
-- generated_from_trainer
 - trl
 - sft
 - alignment-handbook
 - generated_from_trainer
 datasets:
-- HuggingFaceH4/ultrachat_200k
+- generator
 base_model: mistralai/Mistral-7B-v0.1
 model-index:
 - name: zephyr-7b-pl-qlora
@@ -22,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-7b-pl-qlora
 
-This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
 
 ## Model description
 
@@ -46,9 +43,6 @@ The following hyperparameters were used during training:
 - eval_batch_size: 4
 - seed: 42
 - distributed_type: multi-GPU
-- num_devices: 4
-- total_train_batch_size: 8
-- total_eval_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
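
Since the card only lists the training setup, a minimal usage sketch may help. Assumptions: the adapter lives in this repo under the `lora_4/` folder added below, and the repo id `sengi/zephyr-7b-pl-qlora` is inferred from the model name; adjust both if they differ.

```python
# Sketch: attach the LoRA adapter from this repo to the base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# Repo id and subfolder are assumptions based on this commit's layout.
model = PeftModel.from_pretrained(
    base, "sengi/zephyr-7b-pl-qlora", subfolder="lora_4"
)

inputs = tokenizer("Explain QLoRA in one sentence.", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```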
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 0.01,
-    "train_loss": 1.0116421318054198,
-    "train_runtime": 473.1338,
+    "epoch": 0.0,
+    "train_loss": 0.0,
+    "train_runtime": 501.823,
     "train_samples": 207865,
-    "train_samples_per_second": 1.691,
-    "train_steps_per_second": 0.211
+    "train_samples_per_second": 0.399,
+    "train_steps_per_second": 0.199
 }
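
The new throughput numbers are internally consistent with the 100 logged steps and the per-device batch size of 2 from trainer_state.json below; a quick check using only values shown in this commit:

```python
# Consistency check on the reported throughput (values from all_results.json).
train_runtime = 501.823            # seconds
train_steps_per_second = 0.199
train_samples_per_second = 0.399

steps = train_steps_per_second * train_runtime      # ~99.9 -> the 100 logged steps
samples = train_samples_per_second * train_runtime  # ~200  -> 100 steps x batch size 2
print(round(steps), round(samples))                 # 100 200
```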
lora_4/adapter_config.json ADDED
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "MistralForCausalLM",
+    "parent_library": "transformers.models.mistral.modeling_mistral"
+  },
+  "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}
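
For reference, the adapter config above corresponds to roughly this peft setup; a sketch where the keyword arguments mirror the JSON keys (task_type is left unset, as in the file):

```python
# Sketch of a LoraConfig matching the adapter_config.json above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "o_proj", "v_proj", "k_proj", "up_proj",
        "q_proj", "down_proj", "gate_proj",
    ],
)
print(lora_config.peft_type)  # PeftType.LORA
```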
lora_4/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:264fb80c3a7830c9942425211af6bdc69e6bc596276a815cc422cd6c999f2d33
+size 167832240
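
This is a Git LFS pointer, not the weights themselves: the real file is identified by its sha256 oid and size. A minimal sketch for verifying a downloaded copy against the pointer, assuming it landed at the local path below:

```python
# Verify a downloaded LFS object against the pointer's sha256 oid and size.
import hashlib
import os

path = "lora_4/adapter_model.safetensors"  # assumed local path; adjust as needed
expected_oid = "264fb80c3a7830c9942425211af6bdc69e6bc596276a815cc422cd6c999f2d33"
expected_size = 167832240

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("OK: file matches the LFS pointer")
```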
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 0.01,
-    "train_loss": 1.0116421318054198,
-    "train_runtime": 473.1338,
+    "epoch": 0.0,
+    "train_loss": 0.0,
+    "train_runtime": 501.823,
     "train_samples": 207865,
-    "train_samples_per_second": 1.691,
-    "train_steps_per_second": 0.211
+    "train_samples_per_second": 0.399,
+    "train_steps_per_second": 0.199
 }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.005737563830397613,
+  "epoch": 0.0014344526845781992,
   "eval_steps": 500,
   "global_step": 100,
   "is_hyper_param_search": false,
@@ -11,47 +11,47 @@
     {
       "epoch": 0.0,
       "learning_rate": 2e-05,
-      "loss": 1.2234,
+      "loss": 0.0,
       "step": 1
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00019396926207859084,
-      "loss": 1.1179,
+      "loss": 0.0,
       "step": 20
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00015000000000000001,
-      "loss": 1.0056,
+      "loss": 0.0,
       "step": 40
     },
     {
       "epoch": 0.0,
       "learning_rate": 8.263518223330697e-05,
-      "loss": 0.9795,
+      "loss": 0.0,
       "step": 60
     },
     {
       "epoch": 0.0,
       "learning_rate": 2.339555568810221e-05,
-      "loss": 0.9954,
+      "loss": 0.0,
       "step": 80
     },
     {
-      "epoch": 0.01,
+      "epoch": 0.0,
       "learning_rate": 0.0,
-      "loss": 0.9545,
+      "loss": 0.0,
       "step": 100
     },
     {
-      "epoch": 0.01,
+      "epoch": 0.0,
       "step": 100,
-      "total_flos": 7.072526874181632e+16,
-      "train_loss": 1.0116421318054198,
-      "train_runtime": 473.1338,
-      "train_samples_per_second": 1.691,
-      "train_steps_per_second": 0.211
+      "total_flos": 1.768131718545408e+16,
+      "train_loss": 0.0,
+      "train_runtime": 501.823,
+      "train_samples_per_second": 0.399,
+      "train_steps_per_second": 0.199
     }
   ],
   "logging_steps": 20,
@@ -59,7 +59,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
-  "total_flos": 7.072526874181632e+16,
+  "total_flos": 1.768131718545408e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null