thomnis commited on
Commit
5d137be
·
verified ·
1 Parent(s): 33af8ff

Training in progress, step 4770

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4384a128a0ec6df3743bda704006b555752b73cff6bd957895b0f805429a080f
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb3d9d9b7904e5c4b2b4e5f8e2e04d0d88e148d3c050c56c1c175fa004a5e190
3
  size 268290900
run-3/checkpoint-4770/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a48fa33a9ddcf76f32827f42a4c29afa8a6efe0a1ecb59eaa05fc1a5eba8800
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb3d9d9b7904e5c4b2b4e5f8e2e04d0d88e148d3c050c56c1c175fa004a5e190
3
  size 268290900
run-3/checkpoint-4770/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f2aafd8a795d1fc83963def144a6849dff9bf7eb36503883d209ce78d989deb
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d273212006c666110dba1b4525d9539d8db16079acce1aa3213059046e396c29
3
  size 536643898
run-3/checkpoint-4770/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc053df02028bafb95728822603492c92e361e1a2f98460c7483ca185bec87f8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:797c8d7d26e9180dee2526f838ce1bd0f7cdff0bf714d2114e0bcc548438283e
3
  size 1064
run-3/checkpoint-4770/trainer_state.json CHANGED
@@ -10,233 +10,233 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.01754792593419552,
14
- "learning_rate": 0.000776604254595881,
15
- "loss": 0.5977,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.0064516129032258064,
21
- "eval_loss": 0.6016289591789246,
22
- "eval_runtime": 5.4207,
23
- "eval_samples_per_second": 571.886,
24
- "eval_steps_per_second": 11.991,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.013247409835457802,
30
- "learning_rate": 0.0007213193795213482,
31
- "loss": 0.6075,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
  "eval_accuracy": 0.03225806451612903,
37
- "eval_loss": 0.6012811064720154,
38
- "eval_runtime": 5.4611,
39
- "eval_samples_per_second": 567.648,
40
- "eval_steps_per_second": 11.902,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.012003601528704166,
46
- "learning_rate": 0.0006660345044468155,
47
- "loss": 0.6077,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
  "eval_accuracy": 0.03225806451612903,
53
- "eval_loss": 0.6012278199195862,
54
- "eval_runtime": 5.4297,
55
- "eval_samples_per_second": 570.937,
56
- "eval_steps_per_second": 11.971,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.010578780435025692,
62
- "learning_rate": 0.0006107496293722828,
63
- "loss": 0.6072,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
  "eval_accuracy": 0.03225806451612903,
69
- "eval_loss": 0.6011785268783569,
70
- "eval_runtime": 5.4145,
71
- "eval_samples_per_second": 572.54,
72
- "eval_steps_per_second": 12.005,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.01368038635700941,
78
- "learning_rate": 0.00055546475429775,
79
- "loss": 0.608,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
  "eval_accuracy": 0.03225806451612903,
85
- "eval_loss": 0.6011925339698792,
86
- "eval_runtime": 5.4677,
87
- "eval_samples_per_second": 566.968,
88
- "eval_steps_per_second": 11.888,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.01561660785228014,
94
- "learning_rate": 0.0005001798792232172,
95
- "loss": 0.6076,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
  "eval_accuracy": 0.03225806451612903,
101
- "eval_loss": 0.6011727452278137,
102
- "eval_runtime": 5.4651,
103
- "eval_samples_per_second": 567.234,
104
- "eval_steps_per_second": 11.894,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.012493623420596123,
110
- "learning_rate": 0.00044489500414868454,
111
- "loss": 0.607,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
  "eval_accuracy": 0.03225806451612903,
117
- "eval_loss": 0.6012096405029297,
118
- "eval_runtime": 5.3992,
119
- "eval_samples_per_second": 574.154,
120
- "eval_steps_per_second": 12.039,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.014549925923347473,
126
- "learning_rate": 0.00038961012907415184,
127
- "loss": 0.6078,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
  "eval_accuracy": 0.03225806451612903,
133
- "eval_loss": 0.6011644601821899,
134
- "eval_runtime": 5.4037,
135
- "eval_samples_per_second": 573.681,
136
- "eval_steps_per_second": 12.029,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.02066616527736187,
142
- "learning_rate": 0.0003343252539996191,
143
- "loss": 0.6075,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
  "eval_accuracy": 0.03225806451612903,
149
- "eval_loss": 0.6011798977851868,
150
- "eval_runtime": 5.475,
151
- "eval_samples_per_second": 566.208,
152
- "eval_steps_per_second": 11.872,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
- "grad_norm": 0.010850013233721256,
158
- "learning_rate": 0.00027904037892508633,
159
- "loss": 0.6076,
160
  "step": 3170
161
  },
162
  {
163
  "epoch": 10.0,
164
  "eval_accuracy": 0.03225806451612903,
165
- "eval_loss": 0.6012014746665955,
166
- "eval_runtime": 5.4388,
167
- "eval_samples_per_second": 569.982,
168
- "eval_steps_per_second": 11.951,
169
  "step": 3180
170
  },
171
  {
172
  "epoch": 10.965408805031446,
173
- "grad_norm": 0.01570860482752323,
174
- "learning_rate": 0.00022375550385055363,
175
- "loss": 0.6077,
176
  "step": 3487
177
  },
178
  {
179
  "epoch": 11.0,
180
  "eval_accuracy": 0.03225806451612903,
181
- "eval_loss": 0.6011857390403748,
182
- "eval_runtime": 5.4108,
183
- "eval_samples_per_second": 572.928,
184
- "eval_steps_per_second": 12.013,
185
  "step": 3498
186
  },
187
  {
188
  "epoch": 11.962264150943396,
189
- "grad_norm": 0.007834916934370995,
190
- "learning_rate": 0.00016847062877602088,
191
- "loss": 0.6072,
192
  "step": 3804
193
  },
194
  {
195
  "epoch": 12.0,
196
  "eval_accuracy": 0.03225806451612903,
197
- "eval_loss": 0.6011877655982971,
198
- "eval_runtime": 5.4133,
199
- "eval_samples_per_second": 572.658,
200
- "eval_steps_per_second": 12.007,
201
  "step": 3816
202
  },
203
  {
204
  "epoch": 12.959119496855346,
205
- "grad_norm": 0.015295589342713356,
206
- "learning_rate": 0.00011318575370148815,
207
- "loss": 0.6081,
208
  "step": 4121
209
  },
210
  {
211
  "epoch": 13.0,
212
  "eval_accuracy": 0.03225806451612903,
213
- "eval_loss": 0.6011701226234436,
214
- "eval_runtime": 5.4283,
215
- "eval_samples_per_second": 571.085,
216
- "eval_steps_per_second": 11.974,
217
  "step": 4134
218
  },
219
  {
220
  "epoch": 13.955974842767295,
221
- "grad_norm": 0.014071750454604626,
222
- "learning_rate": 5.790087862695542e-05,
223
- "loss": 0.607,
224
  "step": 4438
225
  },
226
  {
227
  "epoch": 14.0,
228
  "eval_accuracy": 0.03225806451612903,
229
- "eval_loss": 0.6011898517608643,
230
- "eval_runtime": 5.4906,
231
- "eval_samples_per_second": 564.599,
232
- "eval_steps_per_second": 11.838,
233
  "step": 4452
234
  },
235
  {
236
  "epoch": 14.952830188679245,
237
- "grad_norm": 0.011002879589796066,
238
- "learning_rate": 2.6160035524226845e-06,
239
- "loss": 0.6074,
240
  "step": 4755
241
  }
242
  ],
@@ -261,11 +261,11 @@
261
  "train_batch_size": 48,
262
  "trial_name": null,
263
  "trial_params": {
264
- "alpha": 0.3738796870532265,
265
- "learning_rate": 0.0008318891296704137,
266
- "lr_scheduler_type": "linear",
267
  "num_train_epochs": 15,
268
- "temperature": 10.526715944436805,
269
- "weight_decay": 0.22690902065757862
270
  }
271
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.007859878242015839,
14
+ "learning_rate": 0.0007934093547399718,
15
+ "loss": 0.5931,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.03225806451612903,
21
+ "eval_loss": 0.5877403020858765,
22
+ "eval_runtime": 5.3692,
23
+ "eval_samples_per_second": 577.366,
24
+ "eval_steps_per_second": 12.106,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.013005654327571392,
30
+ "learning_rate": 0.0007676597858899992,
31
+ "loss": 0.5933,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
  "eval_accuracy": 0.03225806451612903,
37
+ "eval_loss": 0.5876849293708801,
38
+ "eval_runtime": 5.3979,
39
+ "eval_samples_per_second": 574.293,
40
+ "eval_steps_per_second": 12.042,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.011686289682984352,
46
+ "learning_rate": 0.0007259882616863973,
47
+ "loss": 0.5936,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
  "eval_accuracy": 0.03225806451612903,
53
+ "eval_loss": 0.5876944661140442,
54
+ "eval_runtime": 5.3376,
55
+ "eval_samples_per_second": 580.786,
56
+ "eval_steps_per_second": 12.178,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.010250881314277649,
62
+ "learning_rate": 0.0006702046329072582,
63
+ "loss": 0.5932,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
  "eval_accuracy": 0.03225806451612903,
69
+ "eval_loss": 0.5876566171646118,
70
+ "eval_runtime": 5.3623,
71
+ "eval_samples_per_second": 578.11,
72
+ "eval_steps_per_second": 12.122,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.013469184748828411,
78
+ "learning_rate": 0.0006027316581600536,
79
+ "loss": 0.594,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
  "eval_accuracy": 0.03225806451612903,
85
+ "eval_loss": 0.5876731276512146,
86
+ "eval_runtime": 5.3565,
87
+ "eval_samples_per_second": 578.738,
88
+ "eval_steps_per_second": 12.135,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.015176467597484589,
94
+ "learning_rate": 0.0005264997801914848,
95
+ "loss": 0.5936,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
  "eval_accuracy": 0.03225806451612903,
101
+ "eval_loss": 0.5876610279083252,
102
+ "eval_runtime": 5.3967,
103
+ "eval_samples_per_second": 574.42,
104
+ "eval_steps_per_second": 12.044,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.01225706934928894,
110
+ "learning_rate": 0.0004448198527870465,
111
+ "loss": 0.593,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
  "eval_accuracy": 0.03225806451612903,
117
+ "eval_loss": 0.5876200795173645,
118
+ "eval_runtime": 5.386,
119
+ "eval_samples_per_second": 575.564,
120
+ "eval_steps_per_second": 12.068,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.014281037263572216,
126
+ "learning_rate": 0.00036123934590356535,
127
+ "loss": 0.5938,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
  "eval_accuracy": 0.03225806451612903,
133
+ "eval_loss": 0.5876643061637878,
134
+ "eval_runtime": 5.34,
135
+ "eval_samples_per_second": 580.525,
136
+ "eval_steps_per_second": 12.172,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.020413335412740707,
142
+ "learning_rate": 0.0002793882742407039,
143
+ "loss": 0.5934,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
  "eval_accuracy": 0.03225806451612903,
149
+ "eval_loss": 0.5876378417015076,
150
+ "eval_runtime": 5.3343,
151
+ "eval_samples_per_second": 581.141,
152
+ "eval_steps_per_second": 12.185,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
+ "grad_norm": 0.010601122863590717,
158
+ "learning_rate": 0.00020282154078240177,
159
+ "loss": 0.5935,
160
  "step": 3170
161
  },
162
  {
163
  "epoch": 10.0,
164
  "eval_accuracy": 0.03225806451612903,
165
+ "eval_loss": 0.5876396298408508,
166
+ "eval_runtime": 5.3546,
167
+ "eval_samples_per_second": 578.942,
168
+ "eval_steps_per_second": 12.139,
169
  "step": 3180
170
  },
171
  {
172
  "epoch": 10.965408805031446,
173
+ "grad_norm": 0.015482204966247082,
174
+ "learning_rate": 0.00013486454254193946,
175
+ "loss": 0.5936,
176
  "step": 3487
177
  },
178
  {
179
  "epoch": 11.0,
180
  "eval_accuracy": 0.03225806451612903,
181
+ "eval_loss": 0.5875952839851379,
182
+ "eval_runtime": 5.3966,
183
+ "eval_samples_per_second": 574.436,
184
+ "eval_steps_per_second": 12.045,
185
  "step": 3498
186
  },
187
  {
188
  "epoch": 11.962264150943396,
189
+ "grad_norm": 0.007544202264398336,
190
+ "learning_rate": 7.846874406237966e-05,
191
+ "loss": 0.5932,
192
  "step": 3804
193
  },
194
  {
195
  "epoch": 12.0,
196
  "eval_accuracy": 0.03225806451612903,
197
+ "eval_loss": 0.587660014629364,
198
+ "eval_runtime": 5.388,
199
+ "eval_samples_per_second": 575.35,
200
+ "eval_steps_per_second": 12.064,
201
  "step": 3816
202
  },
203
  {
204
  "epoch": 12.959119496855346,
205
+ "grad_norm": 0.015037346631288528,
206
+ "learning_rate": 3.608349131102299e-05,
207
+ "loss": 0.594,
208
  "step": 4121
209
  },
210
  {
211
  "epoch": 13.0,
212
  "eval_accuracy": 0.03225806451612903,
213
+ "eval_loss": 0.5875839591026306,
214
+ "eval_runtime": 5.3407,
215
+ "eval_samples_per_second": 580.45,
216
+ "eval_steps_per_second": 12.171,
217
  "step": 4134
218
  },
219
  {
220
  "epoch": 13.955974842767295,
221
+ "grad_norm": 0.013815987855196,
222
+ "learning_rate": 9.549633264184268e-06,
223
+ "loss": 0.593,
224
  "step": 4438
225
  },
226
  {
227
  "epoch": 14.0,
228
  "eval_accuracy": 0.03225806451612903,
229
+ "eval_loss": 0.5875993371009827,
230
+ "eval_runtime": 5.3398,
231
+ "eval_samples_per_second": 580.541,
232
+ "eval_steps_per_second": 12.173,
233
  "step": 4452
234
  },
235
  {
236
  "epoch": 14.952830188679245,
237
+ "grad_norm": 0.010695732198655605,
238
+ "learning_rate": 1.9571341049241364e-08,
239
+ "loss": 0.5934,
240
  "step": 4755
241
  }
242
  ],
 
261
  "train_batch_size": 48,
262
  "trial_name": null,
263
  "trial_params": {
264
+ "alpha": 0.5457565605433671,
265
+ "learning_rate": 0.0008021186295599815,
266
+ "lr_scheduler_type": "cosine",
267
  "num_train_epochs": 15,
268
+ "temperature": 13.83793993486481,
269
+ "weight_decay": 0.09136269626429569
270
  }
271
  }
run-3/checkpoint-4770/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:250abb006b19366e1adab089f1609d1d21d51fe8eff45a07f7a4f1d918f262d8
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:897f112baf000af50343a07b648e18d94170f7867db99d9989dad76b8d4ae6a7
3
  size 5240
runs/Oct20_13-24-54_87443764e281/events.out.tfevents.1729433136.87443764e281.307.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:742b6d0d21ad38e949126f20c1628a16142201b4bae170fb91e5943c5cd936f2
3
+ size 20825
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5589e749649dba3af878a3715400f3ce342a88ed8dcc73e44b015459960a4ca
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:897f112baf000af50343a07b648e18d94170f7867db99d9989dad76b8d4ae6a7
3
  size 5240