sengi committed on
Commit
3073f9b
1 Parent(s): 69eb809

Model save

Browse files
README.md CHANGED
@@ -2,17 +2,13 @@
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - alignment-handbook
12
  - generated_from_trainer
13
  datasets:
14
- - HuggingFaceH4/ultrachat_200k
15
- base_model: mistralai/Mistral-7B-v0.1
16
  model-index:
17
  - name: zephyr-7b-pl-qlora
18
  results: []
@@ -23,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  # zephyr-7b-pl-qlora
25
 
26
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
27
 
28
  ## Model description
29
 
@@ -47,8 +43,10 @@ The following hyperparameters were used during training:
47
  - eval_batch_size: 4
48
  - seed: 42
49
  - distributed_type: multi-GPU
50
- - gradient_accumulation_steps: 4
51
- - total_train_batch_size: 8
 
 
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
 
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - alignment-handbook
8
  - generated_from_trainer
9
  datasets:
10
+ - generator
11
+ base_model: alignment-handbook/zephyr-7b-sft-full
12
  model-index:
13
  - name: zephyr-7b-pl-qlora
14
  results: []
 
19
 
20
  # zephyr-7b-pl-qlora
21
 
22
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the generator dataset.
23
 
24
  ## Model description
25
 
 
43
  - eval_batch_size: 4
44
  - seed: 42
45
  - distributed_type: multi-GPU
46
+ - num_devices: 4
47
+ - gradient_accumulation_steps: 2
48
+ - total_train_batch_size: 16
49
+ - total_eval_batch_size: 16
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.1
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 0.01,
3
- "train_loss": 0.6359813857078552,
4
- "train_runtime": 2120.9698,
5
  "train_samples": 207865,
6
- "train_samples_per_second": 0.377,
7
- "train_steps_per_second": 0.047
8
  }
 
1
  {
2
  "epoch": 0.01,
3
+ "train_loss": 0.6498598456382751,
4
+ "train_runtime": 1109.6734,
5
  "train_samples": 207865,
6
+ "train_samples_per_second": 1.442,
7
+ "train_steps_per_second": 0.09
8
  }
lora_0/adapter_config.json CHANGED
@@ -4,7 +4,7 @@
4
  "base_model_class": "MistralForCausalLM",
5
  "parent_library": "transformers.models.mistral.modeling_mistral"
6
  },
7
- "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
8
  "bias": "none",
9
  "fan_in_fan_out": false,
10
  "inference_mode": true,
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "gate_proj",
27
- "up_proj",
28
  "q_proj",
29
- "v_proj",
30
  "o_proj",
31
  "k_proj",
32
- "down_proj"
 
 
33
  ],
34
  "task_type": null,
35
  "use_dora": false,
 
4
  "base_model_class": "MistralForCausalLM",
5
  "parent_library": "transformers.models.mistral.modeling_mistral"
6
  },
7
+ "base_model_name_or_path": "alignment-handbook/zephyr-7b-sft-full",
8
  "bias": "none",
9
  "fan_in_fan_out": false,
10
  "inference_mode": true,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "q_proj",
27
+ "gate_proj",
28
  "o_proj",
29
  "k_proj",
30
+ "v_proj",
31
+ "down_proj",
32
+ "up_proj"
33
  ],
34
  "task_type": null,
35
  "use_dora": false,
lora_0/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d95350f5041ab4e9166125ad0900a94e5edbb0d39d4fdfef11b30e664c3a0d10
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be25e89d7bcc96fe7078f3f362f8458402b5e2ff4e9c4bd3da4a4348ce5264a7
3
  size 167832240
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 0.01,
3
- "train_loss": 0.6359813857078552,
4
- "train_runtime": 2120.9698,
5
  "train_samples": 207865,
6
- "train_samples_per_second": 0.377,
7
- "train_steps_per_second": 0.047
8
  }
 
1
  {
2
  "epoch": 0.01,
3
+ "train_loss": 0.6498598456382751,
4
+ "train_runtime": 1109.6734,
5
  "train_samples": 207865,
6
+ "train_samples_per_second": 1.442,
7
+ "train_steps_per_second": 0.09
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.005737810738312797,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
@@ -23,125 +23,125 @@
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 5e-07,
26
- "loss": 0.6921,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 4.96201938253052e-07,
32
- "loss": 0.6886,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 4.849231551964771e-07,
38
- "loss": 0.6827,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.0,
43
  "learning_rate": 4.6650635094610966e-07,
44
- "loss": 0.6745,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.0,
49
  "learning_rate": 4.415111107797445e-07,
50
- "loss": 0.6645,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0,
55
  "learning_rate": 4.106969024216348e-07,
56
- "loss": 0.6549,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.0,
61
  "learning_rate": 3.75e-07,
62
- "loss": 0.6463,
63
  "step": 40
64
  },
65
  {
66
- "epoch": 0.0,
67
  "learning_rate": 3.355050358314172e-07,
68
- "loss": 0.6395,
69
  "step": 45
70
  },
71
  {
72
- "epoch": 0.0,
73
  "learning_rate": 2.934120444167326e-07,
74
- "loss": 0.6305,
75
  "step": 50
76
  },
77
  {
78
- "epoch": 0.0,
79
  "learning_rate": 2.5e-07,
80
- "loss": 0.6241,
81
  "step": 55
82
  },
83
  {
84
- "epoch": 0.0,
85
  "learning_rate": 2.065879555832674e-07,
86
- "loss": 0.6155,
87
  "step": 60
88
  },
89
  {
90
- "epoch": 0.0,
91
  "learning_rate": 1.6449496416858282e-07,
92
- "loss": 0.6147,
93
  "step": 65
94
  },
95
  {
96
- "epoch": 0.0,
97
  "learning_rate": 1.2500000000000005e-07,
98
- "loss": 0.6042,
99
  "step": 70
100
  },
101
  {
102
- "epoch": 0.0,
103
  "learning_rate": 8.930309757836516e-08,
104
- "loss": 0.6041,
105
  "step": 75
106
  },
107
  {
108
- "epoch": 0.0,
109
  "learning_rate": 5.848888922025552e-08,
110
- "loss": 0.5989,
111
  "step": 80
112
  },
113
  {
114
- "epoch": 0.0,
115
  "learning_rate": 3.349364905389032e-08,
116
- "loss": 0.5988,
117
  "step": 85
118
  },
119
  {
120
  "epoch": 0.01,
121
  "learning_rate": 1.507684480352292e-08,
122
- "loss": 0.5961,
123
  "step": 90
124
  },
125
  {
126
  "epoch": 0.01,
127
  "learning_rate": 3.798061746947995e-09,
128
- "loss": 0.5984,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 0.01,
133
  "learning_rate": 0.0,
134
- "loss": 0.5982,
135
  "step": 100
136
  },
137
  {
138
  "epoch": 0.01,
139
  "step": 100,
140
- "total_flos": 7.072526927868723e+16,
141
- "train_loss": 0.6359813857078552,
142
- "train_runtime": 2120.9698,
143
- "train_samples_per_second": 0.377,
144
- "train_steps_per_second": 0.047
145
  }
146
  ],
147
  "logging_steps": 5,
@@ -149,7 +149,7 @@
149
  "num_input_tokens_seen": 0,
150
  "num_train_epochs": 1,
151
  "save_steps": 100,
152
- "total_flos": 7.072526927868723e+16,
153
  "train_batch_size": 2,
154
  "trial_name": null,
155
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.011475127660795226,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
 
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 5e-07,
26
+ "loss": 0.6922,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 4.96201938253052e-07,
32
+ "loss": 0.689,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 4.849231551964771e-07,
38
+ "loss": 0.6841,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.0,
43
  "learning_rate": 4.6650635094610966e-07,
44
+ "loss": 0.6785,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.0,
49
  "learning_rate": 4.415111107797445e-07,
50
+ "loss": 0.6716,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0,
55
  "learning_rate": 4.106969024216348e-07,
56
+ "loss": 0.6645,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.0,
61
  "learning_rate": 3.75e-07,
62
+ "loss": 0.6583,
63
  "step": 40
64
  },
65
  {
66
+ "epoch": 0.01,
67
  "learning_rate": 3.355050358314172e-07,
68
+ "loss": 0.6513,
69
  "step": 45
70
  },
71
  {
72
+ "epoch": 0.01,
73
  "learning_rate": 2.934120444167326e-07,
74
+ "loss": 0.6454,
75
  "step": 50
76
  },
77
  {
78
+ "epoch": 0.01,
79
  "learning_rate": 2.5e-07,
80
+ "loss": 0.6396,
81
  "step": 55
82
  },
83
  {
84
+ "epoch": 0.01,
85
  "learning_rate": 2.065879555832674e-07,
86
+ "loss": 0.635,
87
  "step": 60
88
  },
89
  {
90
+ "epoch": 0.01,
91
  "learning_rate": 1.6449496416858282e-07,
92
+ "loss": 0.6302,
93
  "step": 65
94
  },
95
  {
96
+ "epoch": 0.01,
97
  "learning_rate": 1.2500000000000005e-07,
98
+ "loss": 0.6281,
99
  "step": 70
100
  },
101
  {
102
+ "epoch": 0.01,
103
  "learning_rate": 8.930309757836516e-08,
104
+ "loss": 0.6258,
105
  "step": 75
106
  },
107
  {
108
+ "epoch": 0.01,
109
  "learning_rate": 5.848888922025552e-08,
110
+ "loss": 0.6249,
111
  "step": 80
112
  },
113
  {
114
+ "epoch": 0.01,
115
  "learning_rate": 3.349364905389032e-08,
116
+ "loss": 0.6218,
117
  "step": 85
118
  },
119
  {
120
  "epoch": 0.01,
121
  "learning_rate": 1.507684480352292e-08,
122
+ "loss": 0.6206,
123
  "step": 90
124
  },
125
  {
126
  "epoch": 0.01,
127
  "learning_rate": 3.798061746947995e-09,
128
+ "loss": 0.6223,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 0.01,
133
  "learning_rate": 0.0,
134
+ "loss": 0.6206,
135
  "step": 100
136
  },
137
  {
138
  "epoch": 0.01,
139
  "step": 100,
140
+ "total_flos": 1.4145053855737446e+17,
141
+ "train_loss": 0.6498598456382751,
142
+ "train_runtime": 1109.6734,
143
+ "train_samples_per_second": 1.442,
144
+ "train_steps_per_second": 0.09
145
  }
146
  ],
147
  "logging_steps": 5,
 
149
  "num_input_tokens_seen": 0,
150
  "num_train_epochs": 1,
151
  "save_steps": 100,
152
+ "total_flos": 1.4145053855737446e+17,
153
  "train_batch_size": 2,
154
  "trial_name": null,
155
  "trial_params": null