YYT-t committed on
Commit b27fb6c · verified · 1 Parent(s): 0d95f9f

Model save

Files changed (4)
  1. README.md +2 -2
  2. all_results.json +7 -7
  3. train_results.json +7 -7
  4. trainer_state.json +253 -21
README.md CHANGED
@@ -43,8 +43,8 @@ The following hyperparameters were used during training:
  - seed: 42
  - distributed_type: multi-GPU
  - num_devices: 4
- - gradient_accumulation_steps: 2
- - total_train_batch_size: 32
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 64
  - total_eval_batch_size: 8
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
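
The new total is consistent with the other values in this commit: trainer_state.json below still records a per-device train_batch_size of 4, so with 4 devices and the new accumulation factor of 4 the effective batch works out to 64. A minimal sketch of that arithmetic (variable names are illustrative, not from the training script):

```python
# Effective (total) train batch size as reported in the README:
# per-device batch size x number of devices x gradient accumulation steps.
per_device_train_batch_size = 4   # "train_batch_size" in trainer_state.json
num_devices = 4                   # "num_devices" in the README
gradient_accumulation_steps = 4   # new value in this commit (was 2)

total_train_batch_size = (
    per_device_train_batch_size * num_devices * gradient_accumulation_steps
)
print(total_train_batch_size)  # 64, matching "total_train_batch_size: 64"
```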
all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 3.0,
-    "total_flos": 238169358336.0,
-    "train_loss": 0.41675474246342975,
-    "train_runtime": 375.9187,
-    "train_samples": 55,
-    "train_samples_per_second": 0.048,
-    "train_steps_per_second": 0.008
+    "epoch": 2.938775510204082,
+    "total_flos": 17029109121024.0,
+    "train_loss": 0.5480542282263438,
+    "train_runtime": 761.6539,
+    "train_samples": 7473,
+    "train_samples_per_second": 3.08,
+    "train_steps_per_second": 0.047
 }
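
The fractional final epoch follows from the step bookkeeping in trainer_state.json below: each optimizer step advances the epoch counter by about 0.0816 (roughly 12.25 update steps per epoch under the new accumulation settings), so the run caps at max_steps = 36 and ends near 2.94 epochs rather than exactly 3.0. A small check of that arithmetic; the 12.25 steps/epoch figure is inferred from the logged epoch increments, not stated anywhere in the commit:

```python
# Rough reconstruction of the epoch/step bookkeeping (inferred, not from the training script).
first_epoch_increment = 0.08163265306122448    # epoch logged at step 1 in trainer_state.json
steps_per_epoch = 1 / first_epoch_increment    # ~12.25 optimizer steps per epoch
num_train_epochs = 3

max_steps = int(num_train_epochs * steps_per_epoch)   # 36, as logged
final_epoch = max_steps * first_epoch_increment        # ~2.938776, the "epoch" reported above

print(max_steps, round(final_epoch, 6))
```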
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 3.0,
-    "total_flos": 238169358336.0,
-    "train_loss": 0.41675474246342975,
-    "train_runtime": 375.9187,
-    "train_samples": 55,
-    "train_samples_per_second": 0.048,
-    "train_steps_per_second": 0.008
+    "epoch": 2.938775510204082,
+    "total_flos": 17029109121024.0,
+    "train_loss": 0.5480542282263438,
+    "train_runtime": 761.6539,
+    "train_samples": 7473,
+    "train_samples_per_second": 3.08,
+    "train_steps_per_second": 0.047
 }
trainer_state.json CHANGED
@@ -1,45 +1,277 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.0,
+  "epoch": 2.938775510204082,
   "eval_steps": 500,
-  "global_step": 3,
+  "global_step": 36,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "learning_rate": 2e-05,
-      "loss": 0.42,
+      "epoch": 0.08163265306122448,
+      "grad_norm": 12.871264777961246,
+      "learning_rate": 5e-06,
+      "loss": 1.0334,
       "step": 1
     },
     {
-      "epoch": 2.0,
-      "grad_norm": 11.450737697662918,
+      "epoch": 0.16326530612244897,
+      "grad_norm": 12.879485643828415,
       "learning_rate": 1e-05,
-      "loss": 0.4248,
+      "loss": 1.0411,
       "step": 2
     },
     {
-      "epoch": 3.0,
-      "grad_norm": 11.450737697662918,
-      "learning_rate": 0.0,
-      "loss": 0.4055,
+      "epoch": 0.24489795918367346,
+      "grad_norm": 5.758679119848915,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 0.8346,
       "step": 3
     },
     {
-      "epoch": 3.0,
-      "step": 3,
-      "total_flos": 238169358336.0,
-      "train_loss": 0.41675474246342975,
-      "train_runtime": 375.9187,
-      "train_samples_per_second": 0.048,
-      "train_steps_per_second": 0.008
+      "epoch": 0.32653061224489793,
+      "grad_norm": 5.732568718456503,
+      "learning_rate": 2e-05,
+      "loss": 0.7962,
+      "step": 4
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 2.8475781014063286,
+      "learning_rate": 1.995184726672197e-05,
+      "loss": 0.7126,
+      "step": 5
+    },
+    {
+      "epoch": 0.4897959183673469,
+      "grad_norm": 7.835179324031208,
+      "learning_rate": 1.9807852804032306e-05,
+      "loss": 0.904,
+      "step": 6
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 6.113481875959908,
+      "learning_rate": 1.956940335732209e-05,
+      "loss": 0.7776,
+      "step": 7
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 2.349354822354672,
+      "learning_rate": 1.9238795325112867e-05,
+      "loss": 0.7284,
+      "step": 8
+    },
+    {
+      "epoch": 0.7346938775510204,
+      "grad_norm": 1.579653340304688,
+      "learning_rate": 1.881921264348355e-05,
+      "loss": 0.6916,
+      "step": 9
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 1.4021343131954482,
+      "learning_rate": 1.8314696123025456e-05,
+      "loss": 0.6583,
+      "step": 10
+    },
+    {
+      "epoch": 0.8979591836734694,
+      "grad_norm": 1.1807754387171556,
+      "learning_rate": 1.773010453362737e-05,
+      "loss": 0.6475,
+      "step": 11
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 1.1614963968104302,
+      "learning_rate": 1.7071067811865477e-05,
+      "loss": 0.6229,
+      "step": 12
+    },
+    {
+      "epoch": 1.0612244897959184,
+      "grad_norm": 1.128917404557287,
+      "learning_rate": 1.6343932841636455e-05,
+      "loss": 0.57,
+      "step": 13
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.8876639818635172,
+      "learning_rate": 1.5555702330196024e-05,
+      "loss": 0.5341,
+      "step": 14
+    },
+    {
+      "epoch": 1.2244897959183674,
+      "grad_norm": 0.9997328188865856,
+      "learning_rate": 1.4713967368259981e-05,
+      "loss": 0.5163,
+      "step": 15
+    },
+    {
+      "epoch": 1.306122448979592,
+      "grad_norm": 0.9067476498467109,
+      "learning_rate": 1.3826834323650899e-05,
+      "loss": 0.5097,
+      "step": 16
+    },
+    {
+      "epoch": 1.3877551020408163,
+      "grad_norm": 0.8513924391122798,
+      "learning_rate": 1.2902846772544625e-05,
+      "loss": 0.5065,
+      "step": 17
+    },
+    {
+      "epoch": 1.469387755102041,
+      "grad_norm": 0.8831816847725055,
+      "learning_rate": 1.1950903220161286e-05,
+      "loss": 0.4946,
+      "step": 18
+    },
+    {
+      "epoch": 1.5510204081632653,
+      "grad_norm": 0.8139866475964539,
+      "learning_rate": 1.098017140329561e-05,
+      "loss": 0.4947,
+      "step": 19
+    },
+    {
+      "epoch": 1.6326530612244898,
+      "grad_norm": 0.7341998608817863,
+      "learning_rate": 1e-05,
+      "loss": 0.4791,
+      "step": 20
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.7598602977041824,
+      "learning_rate": 9.019828596704394e-06,
+      "loss": 0.4908,
+      "step": 21
+    },
+    {
+      "epoch": 1.7959183673469388,
+      "grad_norm": 0.7075477867390706,
+      "learning_rate": 8.04909677983872e-06,
+      "loss": 0.4792,
+      "step": 22
+    },
+    {
+      "epoch": 1.8775510204081631,
+      "grad_norm": 0.6719521698836661,
+      "learning_rate": 7.097153227455379e-06,
+      "loss": 0.4763,
+      "step": 23
+    },
+    {
+      "epoch": 1.9591836734693877,
+      "grad_norm": 0.6830946946064212,
+      "learning_rate": 6.173165676349103e-06,
+      "loss": 0.4807,
+      "step": 24
+    },
+    {
+      "epoch": 2.0408163265306123,
+      "grad_norm": 0.7628156550515836,
+      "learning_rate": 5.286032631740023e-06,
+      "loss": 0.4347,
+      "step": 25
+    },
+    {
+      "epoch": 2.122448979591837,
+      "grad_norm": 0.9800830997160824,
+      "learning_rate": 4.444297669803981e-06,
+      "loss": 0.3682,
+      "step": 26
+    },
+    {
+      "epoch": 2.204081632653061,
+      "grad_norm": 0.8438420954533535,
+      "learning_rate": 3.6560671583635467e-06,
+      "loss": 0.3713,
+      "step": 27
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 0.7540847452922566,
+      "learning_rate": 2.9289321881345257e-06,
+      "loss": 0.354,
+      "step": 28
+    },
+    {
+      "epoch": 2.36734693877551,
+      "grad_norm": 0.7383497973747206,
+      "learning_rate": 2.26989546637263e-06,
+      "loss": 0.3506,
+      "step": 29
+    },
+    {
+      "epoch": 2.4489795918367347,
+      "grad_norm": 0.7441143965623213,
+      "learning_rate": 1.6853038769745466e-06,
+      "loss": 0.349,
+      "step": 30
+    },
+    {
+      "epoch": 2.5306122448979593,
+      "grad_norm": 0.7973081129823184,
+      "learning_rate": 1.1807873565164507e-06,
+      "loss": 0.3393,
+      "step": 31
+    },
+    {
+      "epoch": 2.612244897959184,
+      "grad_norm": 0.8432676382740615,
+      "learning_rate": 7.612046748871327e-07,
+      "loss": 0.3389,
+      "step": 32
+    },
+    {
+      "epoch": 2.693877551020408,
+      "grad_norm": 0.8547630649684965,
+      "learning_rate": 4.305966426779118e-07,
+      "loss": 0.3508,
+      "step": 33
+    },
+    {
+      "epoch": 2.7755102040816326,
+      "grad_norm": 0.8293429149325029,
+      "learning_rate": 1.921471959676957e-07,
+      "loss": 0.3346,
+      "step": 34
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.8148052667107787,
+      "learning_rate": 4.815273327803183e-08,
+      "loss": 0.3251,
+      "step": 35
+    },
+    {
+      "epoch": 2.938775510204082,
+      "grad_norm": 0.8381390002935672,
+      "learning_rate": 0.0,
+      "loss": 0.3332,
+      "step": 36
+    },
+    {
+      "epoch": 2.938775510204082,
+      "step": 36,
+      "total_flos": 17029109121024.0,
+      "train_loss": 0.5480542282263438,
+      "train_runtime": 761.6539,
+      "train_samples_per_second": 3.08,
+      "train_steps_per_second": 0.047
     }
   ],
   "logging_steps": 1,
-  "max_steps": 3,
+  "max_steps": 36,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 999999,
@@ -55,7 +287,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 238169358336.0,
+  "total_flos": 17029109121024.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null