CreitinGameplays commited on
Commit
66c76ee
·
verified ·
1 Parent(s): f22147d

Upload 13 files

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "train_loss": 0.19717555102511977,
4
- "train_runtime": 3239.6632,
5
- "train_samples": 1519,
6
- "train_samples_per_second": 4.689,
7
- "train_steps_per_second": 0.522
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.005102622340929632,
4
+ "train_runtime": 880.3838,
5
+ "train_samples": 11608,
6
+ "train_samples_per_second": 13.185,
7
+ "train_steps_per_second": 1.648
8
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "CreitinGameplays/elisa-chan-gpt2-medium",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
 
1
  {
2
+ "_name_or_path": "CreitinGameplays/elisa-chan-gpt2-medium-v2",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9040b992a0cbaf8bf990ea4f82cb8ff641087e9d95a28dd2c28e8729b0ef170e
3
  size 1419343360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6604bdf2888448ccee1b8bb082e57f4d988fe372e944c2821db86dd804eceb04
3
  size 1419343360
tokenizer.json CHANGED
@@ -1,7 +1,21 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 50256,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 341,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 341
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 50257,
16
+ "pad_type_id": 0,
17
+ "pad_token": "[PAD]"
18
+ },
19
  "added_tokens": [
20
  {
21
  "id": 50256,
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "train_loss": 0.19717555102511977,
4
- "train_runtime": 3239.6632,
5
- "train_samples": 1519,
6
- "train_samples_per_second": 4.689,
7
- "train_steps_per_second": 0.522
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.005102622340929632,
4
+ "train_runtime": 880.3838,
5
+ "train_samples": 11608,
6
+ "train_samples_per_second": 13.185,
7
+ "train_steps_per_second": 1.648
8
  }
trainer_state.json CHANGED
@@ -1,358 +1,217 @@
1
  {
2
- "best_metric": 6.297296047210693,
3
- "best_model_checkpoint": "./output_dir/checkpoint-169",
4
- "epoch": 10.0,
5
  "eval_steps": 50.0,
6
- "global_step": 1690,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.3,
13
- "learning_rate": 9.70414201183432e-05,
14
- "loss": 1.5791,
15
  "step": 50
16
  },
17
  {
18
- "epoch": 0.59,
19
- "learning_rate": 9.408284023668639e-05,
20
- "loss": 0.6789,
21
  "step": 100
22
  },
23
  {
24
- "epoch": 0.89,
25
- "learning_rate": 9.112426035502959e-05,
26
- "loss": 0.5388,
27
  "step": 150
28
  },
29
  {
30
- "epoch": 1.0,
31
- "eval_bleu-1": 20.7834,
32
- "eval_bleu-2": 4.6208,
33
- "eval_bleu-3": 2.9929,
34
- "eval_bleu-4": 2.6572,
35
- "eval_gen_len": 178.3333,
36
- "eval_loss": 6.297296047210693,
37
- "eval_runtime": 43.3624,
38
- "eval_samples_per_second": 0.277,
39
- "eval_steps_per_second": 0.277,
40
- "step": 169
41
- },
42
- {
43
- "epoch": 1.18,
44
- "learning_rate": 8.816568047337278e-05,
45
- "loss": 0.3897,
46
  "step": 200
47
  },
48
  {
49
- "epoch": 1.48,
50
- "learning_rate": 8.520710059171599e-05,
51
- "loss": 0.3148,
52
  "step": 250
53
  },
54
  {
55
- "epoch": 1.78,
56
- "learning_rate": 8.224852071005918e-05,
57
- "loss": 0.2681,
58
  "step": 300
59
  },
60
  {
61
- "epoch": 2.0,
62
- "eval_bleu-1": 21.2042,
63
- "eval_bleu-2": 4.6329,
64
- "eval_bleu-3": 2.8663,
65
- "eval_bleu-4": 2.5449,
66
- "eval_gen_len": 178.3333,
67
- "eval_loss": 7.237356662750244,
68
- "eval_runtime": 42.1638,
69
- "eval_samples_per_second": 0.285,
70
- "eval_steps_per_second": 0.285,
71
- "step": 338
72
- },
73
- {
74
- "epoch": 2.07,
75
- "learning_rate": 7.928994082840237e-05,
76
- "loss": 0.2515,
77
  "step": 350
78
  },
79
  {
80
- "epoch": 2.37,
81
- "learning_rate": 7.633136094674557e-05,
82
- "loss": 0.17,
83
  "step": 400
84
  },
85
  {
86
- "epoch": 2.66,
87
- "learning_rate": 7.337278106508876e-05,
88
- "loss": 0.1873,
89
  "step": 450
90
  },
91
  {
92
- "epoch": 2.96,
93
- "learning_rate": 7.041420118343195e-05,
94
- "loss": 0.1854,
95
  "step": 500
96
  },
97
  {
98
- "epoch": 3.0,
99
- "eval_bleu-1": 20.1899,
100
- "eval_bleu-2": 4.4735,
101
- "eval_bleu-3": 2.9681,
102
- "eval_bleu-4": 2.5725,
103
- "eval_gen_len": 178.3333,
104
- "eval_loss": 7.7388596534729,
105
- "eval_runtime": 43.0592,
106
- "eval_samples_per_second": 0.279,
107
- "eval_steps_per_second": 0.279,
108
- "step": 507
109
- },
110
- {
111
- "epoch": 3.25,
112
- "learning_rate": 6.745562130177515e-05,
113
- "loss": 0.1395,
114
  "step": 550
115
  },
116
  {
117
- "epoch": 3.55,
118
- "learning_rate": 6.449704142011834e-05,
119
- "loss": 0.1228,
120
  "step": 600
121
  },
122
  {
123
- "epoch": 3.85,
124
- "learning_rate": 6.153846153846155e-05,
125
- "loss": 0.1209,
126
  "step": 650
127
  },
128
  {
129
- "epoch": 4.0,
130
- "eval_bleu-1": 22.4915,
131
- "eval_bleu-2": 5.2106,
132
- "eval_bleu-3": 3.0103,
133
- "eval_bleu-4": 2.4905,
134
- "eval_gen_len": 178.3333,
135
- "eval_loss": 8.396303176879883,
136
- "eval_runtime": 42.158,
137
- "eval_samples_per_second": 0.285,
138
- "eval_steps_per_second": 0.285,
139
- "step": 676
140
- },
141
- {
142
- "epoch": 4.14,
143
- "learning_rate": 5.863905325443787e-05,
144
- "loss": 0.1079,
145
  "step": 700
146
  },
147
  {
148
- "epoch": 4.44,
149
- "learning_rate": 5.568047337278107e-05,
150
- "loss": 0.1073,
151
  "step": 750
152
  },
153
  {
154
- "epoch": 4.73,
155
- "learning_rate": 5.272189349112427e-05,
156
- "loss": 0.0988,
157
  "step": 800
158
  },
159
  {
160
- "epoch": 5.0,
161
- "eval_bleu-1": 20.999,
162
- "eval_bleu-2": 5.0478,
163
- "eval_bleu-3": 3.0062,
164
- "eval_bleu-4": 2.5449,
165
- "eval_gen_len": 178.3333,
166
- "eval_loss": 8.886907577514648,
167
- "eval_runtime": 43.2859,
168
- "eval_samples_per_second": 0.277,
169
- "eval_steps_per_second": 0.277,
170
- "step": 845
171
- },
172
- {
173
- "epoch": 5.03,
174
- "learning_rate": 4.976331360946746e-05,
175
- "loss": 0.0991,
176
  "step": 850
177
  },
178
  {
179
- "epoch": 5.33,
180
- "learning_rate": 4.6804733727810654e-05,
181
- "loss": 0.0864,
182
  "step": 900
183
  },
184
  {
185
- "epoch": 5.62,
186
- "learning_rate": 4.384615384615385e-05,
187
- "loss": 0.0882,
188
  "step": 950
189
  },
190
  {
191
- "epoch": 5.92,
192
- "learning_rate": 4.088757396449705e-05,
193
- "loss": 0.0883,
194
  "step": 1000
195
  },
196
  {
197
- "epoch": 6.0,
198
- "eval_bleu-1": 22.8262,
199
- "eval_bleu-2": 4.6692,
200
- "eval_bleu-3": 2.9419,
201
- "eval_bleu-4": 2.4905,
202
- "eval_gen_len": 178.3333,
203
- "eval_loss": 9.158414840698242,
204
- "eval_runtime": 42.9666,
205
- "eval_samples_per_second": 0.279,
206
- "eval_steps_per_second": 0.279,
207
- "step": 1014
208
- },
209
- {
210
- "epoch": 6.21,
211
- "learning_rate": 3.792899408284024e-05,
212
- "loss": 0.0856,
213
  "step": 1050
214
  },
215
  {
216
- "epoch": 6.51,
217
- "learning_rate": 3.4970414201183435e-05,
218
- "loss": 0.0786,
219
  "step": 1100
220
  },
221
  {
222
- "epoch": 6.8,
223
- "learning_rate": 3.201183431952663e-05,
224
- "loss": 0.0806,
225
  "step": 1150
226
  },
227
  {
228
- "epoch": 7.0,
229
- "eval_bleu-1": 22.2906,
230
- "eval_bleu-2": 4.6692,
231
- "eval_bleu-3": 2.9419,
232
- "eval_bleu-4": 2.4905,
233
- "eval_gen_len": 178.3333,
234
- "eval_loss": 9.391822814941406,
235
- "eval_runtime": 42.3982,
236
- "eval_samples_per_second": 0.283,
237
- "eval_steps_per_second": 0.283,
238
- "step": 1183
239
- },
240
- {
241
- "epoch": 7.1,
242
- "learning_rate": 2.9053254437869826e-05,
243
- "loss": 0.0786,
244
  "step": 1200
245
  },
246
  {
247
- "epoch": 7.4,
248
- "learning_rate": 2.6094674556213016e-05,
249
- "loss": 0.0765,
250
  "step": 1250
251
  },
252
  {
253
- "epoch": 7.69,
254
- "learning_rate": 2.3136094674556213e-05,
255
- "loss": 0.0747,
256
  "step": 1300
257
  },
258
  {
259
- "epoch": 7.99,
260
- "learning_rate": 2.017751479289941e-05,
261
- "loss": 0.0787,
262
  "step": 1350
263
  },
264
  {
265
- "epoch": 8.0,
266
- "eval_bleu-1": 23.8184,
267
- "eval_bleu-2": 5.1303,
268
- "eval_bleu-3": 2.9739,
269
- "eval_bleu-4": 2.6574,
270
- "eval_gen_len": 178.3333,
271
- "eval_loss": 9.662525177001953,
272
- "eval_runtime": 42.8982,
273
- "eval_samples_per_second": 0.28,
274
- "eval_steps_per_second": 0.28,
275
- "step": 1352
276
- },
277
- {
278
- "epoch": 8.28,
279
- "learning_rate": 1.7218934911242603e-05,
280
- "loss": 0.0734,
281
  "step": 1400
282
  },
283
  {
284
- "epoch": 8.58,
285
- "learning_rate": 1.42603550295858e-05,
286
- "loss": 0.0744,
287
  "step": 1450
288
  },
289
  {
290
- "epoch": 8.88,
291
- "learning_rate": 1.1301775147928994e-05,
292
- "loss": 0.0714,
293
- "step": 1500
294
- },
295
- {
296
- "epoch": 9.0,
297
- "eval_bleu-1": 23.7162,
298
- "eval_bleu-2": 5.522,
299
- "eval_bleu-3": 3.0388,
300
- "eval_bleu-4": 2.7154,
301
- "eval_gen_len": 178.3333,
302
- "eval_loss": 9.930649757385254,
303
- "eval_runtime": 45.1011,
304
- "eval_samples_per_second": 0.266,
305
- "eval_steps_per_second": 0.266,
306
- "step": 1521
307
- },
308
- {
309
- "epoch": 9.17,
310
- "learning_rate": 8.34319526627219e-06,
311
- "loss": 0.0724,
312
- "step": 1550
313
- },
314
- {
315
- "epoch": 9.47,
316
- "learning_rate": 5.3846153846153855e-06,
317
- "loss": 0.0704,
318
- "step": 1600
319
- },
320
- {
321
- "epoch": 9.76,
322
- "learning_rate": 2.42603550295858e-06,
323
- "loss": 0.0706,
324
- "step": 1650
325
- },
326
- {
327
- "epoch": 10.0,
328
- "eval_bleu-1": 24.1567,
329
- "eval_bleu-2": 5.3356,
330
- "eval_bleu-3": 2.9739,
331
- "eval_bleu-4": 2.6574,
332
- "eval_gen_len": 178.3333,
333
- "eval_loss": 9.996184349060059,
334
- "eval_runtime": 44.2153,
335
- "eval_samples_per_second": 0.271,
336
- "eval_steps_per_second": 0.271,
337
- "step": 1690
338
  },
339
  {
340
- "epoch": 10.0,
341
- "step": 1690,
342
- "total_flos": 7053481774940160.0,
343
- "train_loss": 0.19717555102511977,
344
- "train_runtime": 3239.6632,
345
- "train_samples_per_second": 4.689,
346
- "train_steps_per_second": 0.522
347
  }
348
  ],
349
  "logging_steps": 50,
350
- "max_steps": 1690,
351
  "num_input_tokens_seen": 0,
352
- "num_train_epochs": 10,
353
  "save_steps": 500,
354
- "total_flos": 7053481774940160.0,
355
- "train_batch_size": 3,
356
  "trial_name": null,
357
  "trial_params": null
358
  }
 
1
  {
2
+ "best_metric": 9.541932106018066,
3
+ "best_model_checkpoint": "./output_dir/checkpoint-1451",
4
+ "epoch": 1.0,
5
  "eval_steps": 50.0,
6
+ "global_step": 1451,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "learning_rate": 9.662301860785666e-05,
14
+ "loss": 0.1431,
15
  "step": 50
16
  },
17
  {
18
+ "epoch": 0.07,
19
+ "learning_rate": 9.317711922811855e-05,
20
+ "loss": 0.0002,
21
  "step": 100
22
  },
23
  {
24
+ "epoch": 0.1,
25
+ "learning_rate": 8.973121984838044e-05,
26
+ "loss": 0.0,
27
  "step": 150
28
  },
29
  {
30
+ "epoch": 0.14,
31
+ "learning_rate": 8.628532046864232e-05,
32
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  "step": 200
34
  },
35
  {
36
+ "epoch": 0.17,
37
+ "learning_rate": 8.283942108890421e-05,
38
+ "loss": 0.0,
39
  "step": 250
40
  },
41
  {
42
+ "epoch": 0.21,
43
+ "learning_rate": 7.939352170916609e-05,
44
+ "loss": 0.0,
45
  "step": 300
46
  },
47
  {
48
+ "epoch": 0.24,
49
+ "learning_rate": 7.594762232942798e-05,
50
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  "step": 350
52
  },
53
  {
54
+ "epoch": 0.28,
55
+ "learning_rate": 7.250172294968988e-05,
56
+ "loss": 0.0,
57
  "step": 400
58
  },
59
  {
60
+ "epoch": 0.31,
61
+ "learning_rate": 6.905582356995176e-05,
62
+ "loss": 0.0,
63
  "step": 450
64
  },
65
  {
66
+ "epoch": 0.34,
67
+ "learning_rate": 6.560992419021364e-05,
68
+ "loss": 0.0002,
69
  "step": 500
70
  },
71
  {
72
+ "epoch": 0.38,
73
+ "learning_rate": 6.216402481047554e-05,
74
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  "step": 550
76
  },
77
  {
78
+ "epoch": 0.41,
79
+ "learning_rate": 5.871812543073743e-05,
80
+ "loss": 0.0,
81
  "step": 600
82
  },
83
  {
84
+ "epoch": 0.45,
85
+ "learning_rate": 5.527222605099931e-05,
86
+ "loss": 0.0,
87
  "step": 650
88
  },
89
  {
90
+ "epoch": 0.48,
91
+ "learning_rate": 5.18263266712612e-05,
92
+ "loss": 0.0038,
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  "step": 700
94
  },
95
  {
96
+ "epoch": 0.52,
97
+ "learning_rate": 4.838042729152309e-05,
98
+ "loss": 0.0001,
99
  "step": 750
100
  },
101
  {
102
+ "epoch": 0.55,
103
+ "learning_rate": 4.493452791178498e-05,
104
+ "loss": 0.0,
105
  "step": 800
106
  },
107
  {
108
+ "epoch": 0.59,
109
+ "learning_rate": 4.1488628532046864e-05,
110
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  "step": 850
112
  },
113
  {
114
+ "epoch": 0.62,
115
+ "learning_rate": 3.8042729152308755e-05,
116
+ "loss": 0.0002,
117
  "step": 900
118
  },
119
  {
120
+ "epoch": 0.65,
121
+ "learning_rate": 3.459682977257065e-05,
122
+ "loss": 0.0001,
123
  "step": 950
124
  },
125
  {
126
+ "epoch": 0.69,
127
+ "learning_rate": 3.115093039283253e-05,
128
+ "loss": 0.0,
129
  "step": 1000
130
  },
131
  {
132
+ "epoch": 0.72,
133
+ "learning_rate": 2.770503101309442e-05,
134
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  "step": 1050
136
  },
137
  {
138
+ "epoch": 0.76,
139
+ "learning_rate": 2.4259131633356307e-05,
140
+ "loss": 0.0,
141
  "step": 1100
142
  },
143
  {
144
+ "epoch": 0.79,
145
+ "learning_rate": 2.0813232253618195e-05,
146
+ "loss": 0.0,
147
  "step": 1150
148
  },
149
  {
150
+ "epoch": 0.83,
151
+ "learning_rate": 1.7367332873880083e-05,
152
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  "step": 1200
154
  },
155
  {
156
+ "epoch": 0.86,
157
+ "learning_rate": 1.3921433494141973e-05,
158
+ "loss": 0.0,
159
  "step": 1250
160
  },
161
  {
162
+ "epoch": 0.9,
163
+ "learning_rate": 1.047553411440386e-05,
164
+ "loss": 0.0,
165
  "step": 1300
166
  },
167
  {
168
+ "epoch": 0.93,
169
+ "learning_rate": 7.029634734665748e-06,
170
+ "loss": 0.0,
171
  "step": 1350
172
  },
173
  {
174
+ "epoch": 0.96,
175
+ "learning_rate": 3.5837353549276364e-06,
176
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  "step": 1400
178
  },
179
  {
180
+ "epoch": 1.0,
181
+ "learning_rate": 1.3783597518952448e-07,
182
+ "loss": 0.0001,
183
  "step": 1450
184
  },
185
  {
186
+ "epoch": 1.0,
187
+ "eval_bleu-1": 9.8455,
188
+ "eval_bleu-2": 1.8537,
189
+ "eval_bleu-3": 1.0957,
190
+ "eval_bleu-4": 1.0496,
191
+ "eval_gen_len": 264.3333,
192
+ "eval_loss": 9.541932106018066,
193
+ "eval_runtime": 46.9425,
194
+ "eval_samples_per_second": 0.256,
195
+ "eval_steps_per_second": 0.256,
196
+ "step": 1451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  },
198
  {
199
+ "epoch": 1.0,
200
+ "step": 1451,
201
+ "total_flos": 7179886646034432.0,
202
+ "train_loss": 0.005102622340929632,
203
+ "train_runtime": 880.3838,
204
+ "train_samples_per_second": 13.185,
205
+ "train_steps_per_second": 1.648
206
  }
207
  ],
208
  "logging_steps": 50,
209
+ "max_steps": 1451,
210
  "num_input_tokens_seen": 0,
211
+ "num_train_epochs": 1,
212
  "save_steps": 500,
213
+ "total_flos": 7179886646034432.0,
214
+ "train_batch_size": 4,
215
  "trial_name": null,
216
  "trial_params": null
217
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7754cb059b5c0c296af00e860e5d33b1f3ed3fb3b768dd9ef00fbe2ddf76d9e6
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83ca1eca1f0c5f9a14cb769aab447d8465e825b45d54f5d3743d297d7ea46659
3
  size 5048