ChenWu98 committed on
Commit
44be9ef
1 Parent(s): 71500d5

Model save

Browse files
Files changed (5) hide show
  1. README.md +4 -14
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +31 -187
README.md CHANGED
@@ -2,16 +2,9 @@
2
  license: mit
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
- - trl
10
- - sft
11
- - generated_from_trainer
12
- datasets:
13
- - ChenWu98/skills_metaphor_chat
14
- - ChenWu98/skills_red_herring_chat
15
  base_model: HuggingFaceH4/zephyr-7b-beta
16
  model-index:
17
  - name: skills_metaphor_chat-skills_red_herring_chat-lora
@@ -23,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  # skills_metaphor_chat-skills_red_herring_chat-lora
25
 
26
- This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the ChenWu98/skills_metaphor_chat and the ChenWu98/skills_red_herring_chat datasets.
27
  It achieves the following results on the evaluation set:
28
- - Loss: 0.2436
29
 
30
  ## Model description
31
 
@@ -54,16 +47,13 @@ The following hyperparameters were used during training:
54
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
55
  - lr_scheduler_type: cosine
56
  - lr_scheduler_warmup_ratio: 0.1
57
- - num_epochs: 4.0
58
 
59
  ### Training results
60
 
61
  | Training Loss | Epoch | Step | Validation Loss |
62
  |:-------------:|:-----:|:----:|:---------------:|
63
- | 0.2002 | 0.99 | 37 | 0.2199 |
64
- | 0.1723 | 2.0 | 75 | 0.2118 |
65
- | 0.1156 | 2.99 | 112 | 0.2252 |
66
- | 0.0804 | 3.95 | 148 | 0.2436 |
67
 
68
 
69
  ### Framework versions
 
2
  license: mit
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
 
 
 
 
8
  base_model: HuggingFaceH4/zephyr-7b-beta
9
  model-index:
10
  - name: skills_metaphor_chat-skills_red_herring_chat-lora
 
16
 
17
  # skills_metaphor_chat-skills_red_herring_chat-lora
18
 
19
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the ChenWu98/skills_metaphor_chat and the ChenWu98/skills_red_herring_chat datasets.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.2247
22
 
23
  ## Model description
24
 
 
47
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
  - lr_scheduler_type: cosine
49
  - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 1.0
51
 
52
  ### Training results
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:-----:|:----:|:---------------:|
56
+ | 0.203 | 0.99 | 37 | 0.2247 |
 
 
 
57
 
58
 
59
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 3.95,
3
- "eval_loss": 0.2436458021402359,
4
- "eval_runtime": 7.9233,
5
  "eval_samples": 200,
6
- "eval_samples_per_second": 25.242,
7
- "eval_steps_per_second": 3.155,
8
- "train_loss": 0.2733316945063101,
9
- "train_runtime": 797.9653,
10
  "train_samples": 600,
11
- "train_samples_per_second": 3.008,
12
- "train_steps_per_second": 0.185
13
  }
 
1
  {
2
+ "epoch": 0.99,
3
+ "eval_loss": 0.22467635571956635,
4
+ "eval_runtime": 8.1549,
5
  "eval_samples": 200,
6
+ "eval_samples_per_second": 24.525,
7
+ "eval_steps_per_second": 3.066,
8
+ "train_loss": 0.5330296472923176,
9
+ "train_runtime": 203.4112,
10
  "train_samples": 600,
11
+ "train_samples_per_second": 2.95,
12
+ "train_steps_per_second": 0.182
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.95,
3
- "eval_loss": 0.2436458021402359,
4
- "eval_runtime": 7.9233,
5
  "eval_samples": 200,
6
- "eval_samples_per_second": 25.242,
7
- "eval_steps_per_second": 3.155
8
  }
 
1
  {
2
+ "epoch": 0.99,
3
+ "eval_loss": 0.22467635571956635,
4
+ "eval_runtime": 8.1549,
5
  "eval_samples": 200,
6
+ "eval_samples_per_second": 24.525,
7
+ "eval_steps_per_second": 3.066
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.95,
3
- "train_loss": 0.2733316945063101,
4
- "train_runtime": 797.9653,
5
  "train_samples": 600,
6
- "train_samples_per_second": 3.008,
7
- "train_steps_per_second": 0.185
8
  }
 
1
  {
2
+ "epoch": 0.99,
3
+ "train_loss": 0.5330296472923176,
4
+ "train_runtime": 203.4112,
5
  "train_samples": 600,
6
+ "train_samples_per_second": 2.95,
7
+ "train_steps_per_second": 0.182
8
  }
trainer_state.json CHANGED
@@ -1,241 +1,85 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.9466666666666668,
5
  "eval_steps": 500,
6
- "global_step": 148,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "learning_rate": 1.3333333333333333e-05,
14
  "loss": 2.2275,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.13,
19
- "learning_rate": 6.666666666666667e-05,
20
- "loss": 2.2349,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.27,
25
- "learning_rate": 0.00013333333333333334,
26
- "loss": 1.4549,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.4,
31
- "learning_rate": 0.0002,
32
- "loss": 0.4894,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.53,
37
- "learning_rate": 0.00019930337092856243,
38
- "loss": 0.269,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.67,
43
- "learning_rate": 0.00019722318955551306,
44
- "loss": 0.2445,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.8,
49
- "learning_rate": 0.00019378843817721854,
50
- "loss": 0.2318,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.93,
55
- "learning_rate": 0.00018904697174694447,
56
- "loss": 0.2002,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.99,
61
- "eval_loss": 0.21994143724441528,
62
- "eval_runtime": 8.8446,
63
- "eval_samples_per_second": 22.613,
64
- "eval_steps_per_second": 2.827,
65
  "step": 37
66
  },
67
  {
68
- "epoch": 1.07,
69
- "learning_rate": 0.0001830648511318223,
70
- "loss": 0.2005,
71
- "step": 40
72
- },
73
- {
74
- "epoch": 1.2,
75
- "learning_rate": 0.00017592542271443887,
76
- "loss": 0.1843,
77
- "step": 45
78
- },
79
- {
80
- "epoch": 1.33,
81
- "learning_rate": 0.00016772815716257412,
82
- "loss": 0.1765,
83
- "step": 50
84
- },
85
- {
86
- "epoch": 1.47,
87
- "learning_rate": 0.00015858726354602248,
88
- "loss": 0.1796,
89
- "step": 55
90
- },
91
- {
92
- "epoch": 1.6,
93
- "learning_rate": 0.00014863009810942815,
94
- "loss": 0.1765,
95
- "step": 60
96
- },
97
- {
98
- "epoch": 1.73,
99
- "learning_rate": 0.000137995389871036,
100
- "loss": 0.1685,
101
- "step": 65
102
- },
103
- {
104
- "epoch": 1.87,
105
- "learning_rate": 0.0001268313077693485,
106
- "loss": 0.1616,
107
- "step": 70
108
- },
109
- {
110
- "epoch": 2.0,
111
- "learning_rate": 0.0001152933962873246,
112
- "loss": 0.1723,
113
- "step": 75
114
- },
115
- {
116
- "epoch": 2.0,
117
- "eval_loss": 0.21179074048995972,
118
- "eval_runtime": 7.9847,
119
- "eval_samples_per_second": 25.048,
120
- "eval_steps_per_second": 3.131,
121
- "step": 75
122
- },
123
- {
124
- "epoch": 2.13,
125
- "learning_rate": 0.00010354240831620541,
126
- "loss": 0.1375,
127
- "step": 80
128
- },
129
- {
130
- "epoch": 2.27,
131
- "learning_rate": 9.174206545276677e-05,
132
- "loss": 0.1257,
133
- "step": 85
134
- },
135
- {
136
- "epoch": 2.4,
137
- "learning_rate": 8.005677693484077e-05,
138
- "loss": 0.1262,
139
- "step": 90
140
- },
141
- {
142
- "epoch": 2.53,
143
- "learning_rate": 6.864934899622191e-05,
144
- "loss": 0.1298,
145
- "step": 95
146
- },
147
- {
148
- "epoch": 2.67,
149
- "learning_rate": 5.767871655555751e-05,
150
- "loss": 0.1154,
151
- "step": 100
152
- },
153
- {
154
- "epoch": 2.8,
155
- "learning_rate": 4.729772884265212e-05,
156
- "loss": 0.12,
157
- "step": 105
158
- },
159
- {
160
- "epoch": 2.93,
161
- "learning_rate": 3.7651019814126654e-05,
162
- "loss": 0.1156,
163
- "step": 110
164
- },
165
- {
166
- "epoch": 2.99,
167
- "eval_loss": 0.2252231389284134,
168
- "eval_runtime": 8.0616,
169
- "eval_samples_per_second": 24.809,
170
- "eval_steps_per_second": 3.101,
171
- "step": 112
172
- },
173
- {
174
- "epoch": 3.07,
175
- "learning_rate": 2.8872993029040508e-05,
176
- "loss": 0.1084,
177
- "step": 115
178
- },
179
- {
180
- "epoch": 3.2,
181
- "learning_rate": 2.1085949060360654e-05,
182
- "loss": 0.0907,
183
- "step": 120
184
- },
185
- {
186
- "epoch": 3.33,
187
- "learning_rate": 1.439838153227e-05,
188
- "loss": 0.0856,
189
- "step": 125
190
- },
191
- {
192
- "epoch": 3.47,
193
- "learning_rate": 8.903465523913957e-06,
194
- "loss": 0.087,
195
- "step": 130
196
- },
197
- {
198
- "epoch": 3.6,
199
- "learning_rate": 4.6777594000230855e-06,
200
- "loss": 0.0818,
201
- "step": 135
202
- },
203
- {
204
- "epoch": 3.73,
205
- "learning_rate": 1.7801381552624563e-06,
206
- "loss": 0.0932,
207
- "step": 140
208
- },
209
- {
210
- "epoch": 3.87,
211
- "learning_rate": 2.509731335744281e-07,
212
- "loss": 0.0804,
213
- "step": 145
214
- },
215
- {
216
- "epoch": 3.95,
217
- "eval_loss": 0.2436458021402359,
218
- "eval_runtime": 7.8963,
219
- "eval_samples_per_second": 25.328,
220
- "eval_steps_per_second": 3.166,
221
- "step": 148
222
- },
223
- {
224
- "epoch": 3.95,
225
- "step": 148,
226
- "total_flos": 82630925025280.0,
227
- "train_loss": 0.2733316945063101,
228
- "train_runtime": 797.9653,
229
- "train_samples_per_second": 3.008,
230
- "train_steps_per_second": 0.185
231
  }
232
  ],
233
  "logging_steps": 5,
234
- "max_steps": 148,
235
  "num_input_tokens_seen": 0,
236
- "num_train_epochs": 4,
237
  "save_steps": 500,
238
- "total_flos": 82630925025280.0,
239
  "train_batch_size": 4,
240
  "trial_name": null,
241
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9866666666666667,
5
  "eval_steps": 500,
6
+ "global_step": 37,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "learning_rate": 5e-05,
14
  "loss": 2.2275,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.13,
19
+ "learning_rate": 0.00019954719225730847,
20
+ "loss": 1.9848,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.27,
25
+ "learning_rate": 0.00018412535328311814,
26
+ "loss": 0.6529,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.4,
31
+ "learning_rate": 0.00015000000000000001,
32
+ "loss": 0.2662,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.53,
37
+ "learning_rate": 0.00010475819158237425,
38
+ "loss": 0.238,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.67,
43
+ "learning_rate": 5.845849869981137e-05,
44
+ "loss": 0.2332,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.8,
49
+ "learning_rate": 2.139469052572127e-05,
50
+ "loss": 0.2287,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.93,
55
+ "learning_rate": 1.8071302737293295e-06,
56
+ "loss": 0.203,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.99,
61
+ "eval_loss": 0.22467635571956635,
62
+ "eval_runtime": 9.0559,
63
+ "eval_samples_per_second": 22.085,
64
+ "eval_steps_per_second": 2.761,
65
  "step": 37
66
  },
67
  {
68
+ "epoch": 0.99,
69
+ "step": 37,
70
+ "total_flos": 20586009395200.0,
71
+ "train_loss": 0.5330296472923176,
72
+ "train_runtime": 203.4112,
73
+ "train_samples_per_second": 2.95,
74
+ "train_steps_per_second": 0.182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  }
76
  ],
77
  "logging_steps": 5,
78
+ "max_steps": 37,
79
  "num_input_tokens_seen": 0,
80
+ "num_train_epochs": 1,
81
  "save_steps": 500,
82
+ "total_flos": 20586009395200.0,
83
  "train_batch_size": 4,
84
  "trial_name": null,
85
  "trial_params": null