NightMachinery committed on
Commit 7883bab
1 Parent(s): a85ca50

Training in progress, step 500

last-checkpoint/mlm/adapter_config.json CHANGED
@@ -6,8 +6,8 @@
  "factorized_phm_rule": false,
  "hypercomplex_nonlinearity": "glorot-uniform",
  "init_weights": "bert",
- "inv_adapter": "nice",
- "inv_adapter_reduction_factor": 2,
+ "inv_adapter": null,
+ "inv_adapter_reduction_factor": null,
  "is_parallel": false,
  "learn_phm": true,
  "leave_out": [],
@@ -31,11 +31,11 @@
  "shared_phm_rule": true,
  "use_gating": false
  },
- "config_id": "9ed5b5a29de19b71",
+ "config_id": "9076f36a74755ac4",
  "hidden_size": 768,
  "model_class": "XLMRobertaForMaskedLM",
  "model_name": "xlm-roberta-base",
  "model_type": "xlm-roberta",
  "name": "mlm",
- "version": "3.2.0a0"
+ "version": "3.2.0"
  }
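
The config above describes the "mlm" adapter trained on top of xlm-roberta-base; the new revision drops the invertible adapter (inv_adapter: null) and records library version 3.2.0. A minimal, hedged sketch of how such a saved adapter directory is typically reloaded with the adapter-transformers fork (assuming its ~v3.2 load_adapter/set_active_adapters API; the local path mirrors this repository's layout and is otherwise hypothetical):

```python
# Sketch only: reload the saved "mlm" adapter from this checkpoint.
# Assumes the adapter-transformers fork (~v3.2), which adds
# load_adapter()/set_active_adapters() to XLMRobertaForMaskedLM.
from transformers import XLMRobertaForMaskedLM

model = XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-base")
# Path taken from this repo's layout; adjust to wherever the files live.
adapter_name = model.load_adapter("last-checkpoint/mlm")
model.set_active_adapters(adapter_name)
```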
last-checkpoint/mlm/head_config.json CHANGED
@@ -10,5 +10,5 @@
  "model_type": "xlm-roberta",
  "name": null,
  "num_labels": 2,
- "version": "3.2.0a0"
+ "version": "3.2.0"
  }
last-checkpoint/mlm/pytorch_adapter.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ab31d3c1e44d96d6aa7dc385bb2a374bb3650411b1f2bbf3a5632cf6d8a364dd
- size 4782029
+ oid sha256:992a56d9cea8f923f3ae5835ae0ccb36e78cff465c9cac4eb543e13a4bc8cb54
+ size 3595045
last-checkpoint/mlm/pytorch_model_head.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c5a68f21c96d1017414c75ab9cb39667b15070c234626b171849d4374c8815f2
+ oid sha256:78f3543c2b9f6c7d47b62337c5e0cfcbd52b25c634208cda9e8dc74c39b2ef1b
  size 771377007
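
The *.bin entries above (and the .pt/.pth ones below) are Git LFS pointer files rather than the weights themselves: each change is just a new sha256 oid and byte size. A minimal standard-library sketch for checking a downloaded blob against its pointer (file paths are hypothetical):

```python
# Sketch only: verify a downloaded blob against its Git LFS pointer.
# Assumes the pointer format shown above; paths are hypothetical.
import hashlib
from pathlib import Path

def verify_lfs_object(pointer_path: str, blob_path: str) -> bool:
    # Parse "oid sha256:<hex>" and "size <bytes>" from the pointer file.
    fields = dict(
        line.split(" ", 1)
        for line in Path(pointer_path).read_text().splitlines()
        if line.strip()
    )
    expected_oid = fields["oid"].strip().removeprefix("sha256:")
    expected_size = int(fields["size"])
    blob = Path(blob_path).read_bytes()
    return len(blob) == expected_size and hashlib.sha256(blob).hexdigest() == expected_oid

# Example (hypothetical paths):
# verify_lfs_object("pytorch_adapter.bin.pointer", "pytorch_adapter.bin")
```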
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c879fb2ff43e64a5330a90d7721d2020e0ba4b2bfa608256284bd1443f844131
- size 16297733
+ oid sha256:9b99ed3e4deac97eb97d586252063d8831cadaf749bd0bd6a167055c47c2e279
+ size 13924549
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b9699ad8a13d39e0088b256bc19bede304b6e109e99940bf7c075c82acfc186a
+ oid sha256:ef12f40a043f312c84af952a05fcc8c7e366ecdd0c0d9a6c44fad7c01b79ac67
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:13f27483992a93e2d0e00be6d9edd9ab2fa3162b3b09ab3826e3fa33ba45c5fe
+ oid sha256:0dd53c40ad142c1d4952384818db07da2a2df2a1a18ce830debd7791a2e791d2
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,331 +1,31 @@
  {
- "best_metric": 1.9753360748291016,
- "best_model_checkpoint": "./test-mlm/checkpoint-10500",
- "epoch": 7.766272189349112,
- "global_step": 10500,
+ "best_metric": 2.2929649353027344,
+ "best_model_checkpoint": "./test-mlm/checkpoint-500",
+ "epoch": 0.3698224852071006,
+ "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
  "epoch": 0.37,
- "learning_rate": 9.6301775147929e-05,
- "loss": 2.703,
+ "learning_rate": 9.537721893491125e-05,
+ "loss": 3.2297,
  "step": 500
  },
  {
  "epoch": 0.37,
- "eval_accuracy": 0.5554450522128295,
- "eval_loss": 2.2368199825286865,
- "eval_runtime": 81.9171,
- "eval_samples_per_second": 10.608,
- "eval_steps_per_second": 2.124,
+ "eval_accuracy": 0.5468516907011437,
+ "eval_loss": 2.2929649353027344,
+ "eval_runtime": 82.9497,
+ "eval_samples_per_second": 10.476,
+ "eval_steps_per_second": 2.098,
  "step": 500
- },
- {
- "epoch": 0.74,
- "learning_rate": 9.260355029585799e-05,
- "loss": 2.34,
- "step": 1000
- },
- {
- "epoch": 0.74,
- "eval_accuracy": 0.5654470537208005,
- "eval_loss": 2.155290126800537,
- "eval_runtime": 81.9029,
- "eval_samples_per_second": 10.61,
- "eval_steps_per_second": 2.124,
- "step": 1000
- },
- {
- "epoch": 1.11,
- "learning_rate": 8.8905325443787e-05,
- "loss": 2.3064,
- "step": 1500
- },
- {
- "epoch": 1.11,
- "eval_accuracy": 0.5682274716205484,
- "eval_loss": 2.143535614013672,
- "eval_runtime": 81.8866,
- "eval_samples_per_second": 10.612,
- "eval_steps_per_second": 2.125,
- "step": 1500
- },
- {
- "epoch": 1.48,
- "learning_rate": 8.520710059171599e-05,
- "loss": 2.2737,
- "step": 2000
- },
- {
- "epoch": 1.48,
- "eval_accuracy": 0.5682253766925845,
- "eval_loss": 2.127504825592041,
- "eval_runtime": 81.8392,
- "eval_samples_per_second": 10.618,
- "eval_steps_per_second": 2.126,
- "step": 2000
- },
- {
- "epoch": 1.85,
- "learning_rate": 8.150887573964498e-05,
- "loss": 2.2323,
- "step": 2500
- },
- {
- "epoch": 1.85,
- "eval_accuracy": 0.57444453106728,
- "eval_loss": 2.0887880325317383,
- "eval_runtime": 81.8536,
- "eval_samples_per_second": 10.617,
- "eval_steps_per_second": 2.126,
- "step": 2500
- },
- {
- "epoch": 2.22,
- "learning_rate": 7.781065088757397e-05,
- "loss": 2.2227,
- "step": 3000
- },
- {
- "epoch": 2.22,
- "eval_accuracy": 0.577296072741943,
- "eval_loss": 2.077665328979492,
- "eval_runtime": 82.1231,
- "eval_samples_per_second": 10.582,
- "eval_steps_per_second": 2.119,
- "step": 3000
- },
- {
- "epoch": 2.59,
- "learning_rate": 7.411242603550296e-05,
- "loss": 2.1994,
- "step": 3500
- },
- {
- "epoch": 2.59,
- "eval_accuracy": 0.5799091007802442,
- "eval_loss": 2.0537660121917725,
- "eval_runtime": 81.9221,
- "eval_samples_per_second": 10.608,
- "eval_steps_per_second": 2.124,
- "step": 3500
- },
- {
- "epoch": 2.96,
- "learning_rate": 7.041420118343195e-05,
- "loss": 2.1832,
- "step": 4000
- },
- {
- "epoch": 2.96,
- "eval_accuracy": 0.5793303771456824,
- "eval_loss": 2.0551059246063232,
- "eval_runtime": 81.9029,
- "eval_samples_per_second": 10.61,
- "eval_steps_per_second": 2.124,
- "step": 4000
- },
- {
- "epoch": 3.33,
- "learning_rate": 6.671597633136095e-05,
- "loss": 2.179,
- "step": 4500
- },
- {
- "epoch": 3.33,
- "eval_accuracy": 0.5803635456614598,
- "eval_loss": 2.0518393516540527,
- "eval_runtime": 81.8891,
- "eval_samples_per_second": 10.612,
- "eval_steps_per_second": 2.125,
- "step": 4500
- },
- {
- "epoch": 3.7,
- "learning_rate": 6.301775147928994e-05,
- "loss": 2.1749,
- "step": 5000
- },
- {
- "epoch": 3.7,
- "eval_accuracy": 0.5847466866015668,
- "eval_loss": 2.0224971771240234,
- "eval_runtime": 81.4891,
- "eval_samples_per_second": 10.664,
- "eval_steps_per_second": 2.135,
- "step": 5000
- },
- {
- "epoch": 4.07,
- "learning_rate": 5.931952662721894e-05,
- "loss": 2.1485,
- "step": 5500
- },
- {
- "epoch": 4.07,
- "eval_accuracy": 0.5841869298546032,
- "eval_loss": 2.0166282653808594,
- "eval_runtime": 81.9288,
- "eval_samples_per_second": 10.607,
- "eval_steps_per_second": 2.124,
- "step": 5500
- },
- {
- "epoch": 4.44,
- "learning_rate": 5.562130177514793e-05,
- "loss": 2.1386,
- "step": 6000
- },
- {
- "epoch": 4.44,
- "eval_accuracy": 0.584889026370737,
- "eval_loss": 2.0117151737213135,
- "eval_runtime": 82.1419,
- "eval_samples_per_second": 10.579,
- "eval_steps_per_second": 2.118,
- "step": 6000
- },
- {
- "epoch": 4.81,
- "learning_rate": 5.192307692307693e-05,
- "loss": 2.1342,
- "step": 6500
- },
- {
- "epoch": 4.81,
- "eval_accuracy": 0.5851434826804285,
- "eval_loss": 2.019951343536377,
- "eval_runtime": 81.9564,
- "eval_samples_per_second": 10.603,
- "eval_steps_per_second": 2.123,
- "step": 6500
- },
- {
- "epoch": 5.18,
- "learning_rate": 4.822485207100592e-05,
- "loss": 2.14,
- "step": 7000
- },
- {
- "epoch": 5.18,
- "eval_accuracy": 0.5875654429927449,
- "eval_loss": 1.9929033517837524,
- "eval_runtime": 81.8893,
- "eval_samples_per_second": 10.612,
- "eval_steps_per_second": 2.125,
- "step": 7000
- },
- {
- "epoch": 5.55,
- "learning_rate": 4.452662721893491e-05,
- "loss": 2.1245,
- "step": 7500
- },
- {
- "epoch": 5.55,
- "eval_accuracy": 0.5850476823443949,
- "eval_loss": 2.0177462100982666,
- "eval_runtime": 81.9426,
- "eval_samples_per_second": 10.605,
- "eval_steps_per_second": 2.123,
- "step": 7500
- },
- {
- "epoch": 5.92,
- "learning_rate": 4.0828402366863904e-05,
- "loss": 2.1144,
- "step": 8000
- },
- {
- "epoch": 5.92,
- "eval_accuracy": 0.5845387459583223,
- "eval_loss": 2.0020604133605957,
- "eval_runtime": 81.8258,
- "eval_samples_per_second": 10.62,
- "eval_steps_per_second": 2.126,
- "step": 8000
- },
- {
- "epoch": 6.29,
- "learning_rate": 3.71301775147929e-05,
- "loss": 2.1204,
- "step": 8500
- },
- {
- "epoch": 6.29,
- "eval_accuracy": 0.5902023229674035,
- "eval_loss": 1.9979915618896484,
- "eval_runtime": 82.1267,
- "eval_samples_per_second": 10.581,
- "eval_steps_per_second": 2.119,
- "step": 8500
- },
- {
- "epoch": 6.66,
- "learning_rate": 3.3431952662721895e-05,
- "loss": 2.1183,
- "step": 9000
- },
- {
- "epoch": 6.66,
- "eval_accuracy": 0.5892121344051747,
- "eval_loss": 1.978848934173584,
- "eval_runtime": 81.8016,
- "eval_samples_per_second": 10.623,
- "eval_steps_per_second": 2.127,
- "step": 9000
- },
- {
- "epoch": 7.03,
- "learning_rate": 2.973372781065089e-05,
- "loss": 2.1074,
- "step": 9500
- },
- {
- "epoch": 7.03,
- "eval_accuracy": 0.5906417360616661,
- "eval_loss": 1.9904392957687378,
- "eval_runtime": 81.9488,
- "eval_samples_per_second": 10.604,
- "eval_steps_per_second": 2.123,
- "step": 9500
- },
- {
- "epoch": 7.4,
- "learning_rate": 2.6035502958579882e-05,
- "loss": 2.1021,
- "step": 10000
- },
- {
- "epoch": 7.4,
- "eval_accuracy": 0.5926501920654422,
- "eval_loss": 1.9878432750701904,
- "eval_runtime": 81.8234,
- "eval_samples_per_second": 10.62,
- "eval_steps_per_second": 2.127,
- "step": 10000
- },
- {
- "epoch": 7.77,
- "learning_rate": 2.2337278106508877e-05,
- "loss": 2.0887,
- "step": 10500
- },
- {
- "epoch": 7.77,
- "eval_accuracy": 0.5924927994053703,
- "eval_loss": 1.9753360748291016,
- "eval_runtime": 81.7861,
- "eval_samples_per_second": 10.625,
- "eval_steps_per_second": 2.128,
- "step": 10500
  }
  ],
- "max_steps": 13520,
- "num_train_epochs": 10,
- "total_flos": 1.4038178060648448e+16,
+ "max_steps": 10816,
+ "num_train_epochs": 8,
+ "total_flos": 666567613440000.0,
  "trial_name": null,
  "trial_params": null
  }
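
The diff above replaces the earlier 10,500-step history (best eval_loss 1.975 at checkpoint-10500) with a fresh run that contains only the step-500 entries and points best_model_checkpoint at ./test-mlm/checkpoint-500. A minimal standard-library sketch for pulling the best checkpoint and the eval-loss curve out of such a trainer_state.json:

```python
# Sketch only: inspect a Hugging Face Trainer checkpoint's trainer_state.json.
# Uses only the fields visible in the diff above.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print("best checkpoint:", state["best_model_checkpoint"])
print("best metric:", state["best_metric"])

# (step, eval_loss) pairs from the log history, e.g. [(500, 2.2929...)]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print(eval_curve)
```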
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1df2b6641708b0e7a1eb6d8e9bc7c8eb16223fbde2ebdc920843d9462399a795
+ oid sha256:b8d50ec052e6f796bf886cada4bb73f925473feb40d57987148846389ad2c56a
  size 3515
runs/Mar10_16-33-29_9eaed4381315/1678466242.7558188/events.out.tfevents.1678466242.9eaed4381315.3748.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c931764ed73a625cc227c99f29888058d8f8b558252fc6504c4792f6e1ad3545
+ size 5686
runs/Mar10_16-33-29_9eaed4381315/events.out.tfevents.1678466242.9eaed4381315.3748.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:340c4bc5ca082ce5ce4c5208eaafe235f70cc6cd65d29c38ae65e8e538bd0f58
+ size 5495
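
The two newly added events.out.tfevents.* files are TensorBoard logs written for this run. A hedged sketch for listing the logged scalars (assumes the tensorboard package and its EventAccumulator API; the run directory comes from the paths above):

```python
# Sketch only: read the scalars logged under runs/ with TensorBoard's EventAccumulator.
# Assumes the `tensorboard` package is installed.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

ea = EventAccumulator("runs/Mar10_16-33-29_9eaed4381315")
ea.Reload()
for tag in ea.Tags()["scalars"]:
    points = [(s.step, s.value) for s in ea.Scalars(tag)]
    print(tag, points[:3])
```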
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1df2b6641708b0e7a1eb6d8e9bc7c8eb16223fbde2ebdc920843d9462399a795
+ oid sha256:b8d50ec052e6f796bf886cada4bb73f925473feb40d57987148846389ad2c56a
  size 3515