leinms commited on
Commit
3e665d7
·
verified ·
1 Parent(s): f26ad4c

Initial model upload

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +154 -154
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efd763dfe8e85c866e6bd66f50809d9d54abce16f01859bae8100bc92ed5f69e
3
  size 1740320848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3795d032986cb36ab2a79c0c82499665666044e317693a78fd89b681f6fbebf8
3
  size 1740320848
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09bc7a04cb1c04fd2bf390ada3ea92d968b99d16473546daf9bf211097dd45d0
3
  size 210016058
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7959cbf37e29a25017956c46760b2e5398aca6b65f51db96eeebc8e94430f44
3
  size 210016058
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ec1c04ea978711253e617f503c8e9a897c2b598c232debda0e8460cead38768
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db173f9b674e0e63d8230619f05ad2fb75dff4f47266517ae9a9ea1de60145b8
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2966c532a4eaa51eb66d10c1d97075ad2d534019e8d79c43ed0464fef1781368
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eff7c54f170c5a8667bfd6c31a3c0808bcd3818f330877116e19201be586e8f
3
  size 1064
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 0.03780783340334892,
3
- "best_model_checkpoint": "./results3/checkpoint-2400",
4
- "epoch": 3.864734299516908,
5
  "eval_steps": 200,
6
  "global_step": 2400,
7
  "is_hyper_param_search": false,
@@ -9,236 +9,236 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.322061191626409,
13
- "grad_norm": 7.461069583892822,
14
  "learning_rate": 6.666666666666667e-07,
15
- "loss": 1.8012,
16
  "step": 200
17
  },
18
  {
19
- "epoch": 0.322061191626409,
20
- "eval_accuracy": 0.18971061093247588,
21
- "eval_f1": 0.10189049057972092,
22
- "eval_loss": 1.7812345027923584,
23
- "eval_precision": 0.12844515990912814,
24
- "eval_recall": 0.18971061093247588,
25
- "eval_runtime": 5.7225,
26
- "eval_samples_per_second": 108.693,
27
- "eval_steps_per_second": 27.261,
28
  "step": 200
29
  },
30
  {
31
- "epoch": 0.644122383252818,
32
- "grad_norm": 6.918825149536133,
33
  "learning_rate": 1.3333333333333334e-06,
34
- "loss": 1.7688,
35
  "step": 400
36
  },
37
  {
38
- "epoch": 0.644122383252818,
39
- "eval_accuracy": 0.26688102893890675,
40
- "eval_f1": 0.21143076934540506,
41
- "eval_loss": 1.7484790086746216,
42
- "eval_precision": 0.48251637799958924,
43
- "eval_recall": 0.26688102893890675,
44
- "eval_runtime": 5.841,
45
- "eval_samples_per_second": 106.489,
46
- "eval_steps_per_second": 26.708,
47
  "step": 400
48
  },
49
  {
50
- "epoch": 0.966183574879227,
51
- "grad_norm": 6.153536319732666,
52
  "learning_rate": 2.0000000000000003e-06,
53
- "loss": 1.7475,
54
  "step": 600
55
  },
56
  {
57
- "epoch": 0.966183574879227,
58
- "eval_accuracy": 0.5353697749196141,
59
- "eval_f1": 0.5292630942880905,
60
- "eval_loss": 1.6678913831710815,
61
- "eval_precision": 0.5777595640628179,
62
- "eval_recall": 0.5353697749196141,
63
- "eval_runtime": 5.7676,
64
- "eval_samples_per_second": 107.844,
65
- "eval_steps_per_second": 27.048,
66
  "step": 600
67
  },
68
  {
69
- "epoch": 1.288244766505636,
70
- "grad_norm": 10.906112670898438,
71
  "learning_rate": 2.666666666666667e-06,
72
- "loss": 1.6323,
73
  "step": 800
74
  },
75
  {
76
- "epoch": 1.288244766505636,
77
- "eval_accuracy": 0.6109324758842444,
78
- "eval_f1": 0.584022217728586,
79
- "eval_loss": 1.4518650770187378,
80
- "eval_precision": 0.6770832871589993,
81
- "eval_recall": 0.6109324758842444,
82
- "eval_runtime": 5.7252,
83
- "eval_samples_per_second": 108.643,
84
- "eval_steps_per_second": 27.248,
85
  "step": 800
86
  },
87
  {
88
- "epoch": 1.6103059581320451,
89
- "grad_norm": 11.212413787841797,
90
  "learning_rate": 3.3333333333333333e-06,
91
- "loss": 1.3569,
92
  "step": 1000
93
  },
94
  {
95
- "epoch": 1.6103059581320451,
96
- "eval_accuracy": 0.6672025723472669,
97
- "eval_f1": 0.6448086526662313,
98
- "eval_loss": 1.0713452100753784,
99
- "eval_precision": 0.7227579722788608,
100
- "eval_recall": 0.6672025723472669,
101
- "eval_runtime": 5.8441,
102
- "eval_samples_per_second": 106.433,
103
- "eval_steps_per_second": 26.694,
104
  "step": 1000
105
  },
106
  {
107
- "epoch": 1.9323671497584543,
108
- "grad_norm": 10.077208518981934,
109
  "learning_rate": 4.000000000000001e-06,
110
- "loss": 0.9744,
111
  "step": 1200
112
  },
113
  {
114
- "epoch": 1.9323671497584543,
115
- "eval_accuracy": 0.8360128617363344,
116
- "eval_f1": 0.8313516450563994,
117
- "eval_loss": 0.6114147901535034,
118
- "eval_precision": 0.8485064229080279,
119
- "eval_recall": 0.8360128617363344,
120
- "eval_runtime": 5.7667,
121
- "eval_samples_per_second": 107.861,
122
- "eval_steps_per_second": 27.052,
123
  "step": 1200
124
  },
125
  {
126
- "epoch": 2.2544283413848634,
127
- "grad_norm": 3.6660408973693848,
128
  "learning_rate": 4.666666666666667e-06,
129
- "loss": 0.5969,
130
  "step": 1400
131
  },
132
  {
133
- "epoch": 2.2544283413848634,
134
- "eval_accuracy": 0.9180064308681672,
135
- "eval_f1": 0.9171911311305204,
136
- "eval_loss": 0.29916083812713623,
137
- "eval_precision": 0.9217784712222669,
138
- "eval_recall": 0.9180064308681672,
139
- "eval_runtime": 5.8166,
140
- "eval_samples_per_second": 106.936,
141
- "eval_steps_per_second": 26.82,
142
  "step": 1400
143
  },
144
  {
145
- "epoch": 2.576489533011272,
146
- "grad_norm": 3.6452574729919434,
147
- "learning_rate": 4.994440868783523e-06,
148
- "loss": 0.3187,
149
  "step": 1600
150
  },
151
  {
152
- "epoch": 2.576489533011272,
153
- "eval_accuracy": 0.9453376205787781,
154
- "eval_f1": 0.94523522766866,
155
- "eval_loss": 0.1684405654668808,
156
- "eval_precision": 0.951570696538466,
157
- "eval_recall": 0.9453376205787781,
158
- "eval_runtime": 5.7215,
159
- "eval_samples_per_second": 108.712,
160
- "eval_steps_per_second": 27.265,
161
  "step": 1600
162
  },
163
  {
164
- "epoch": 2.898550724637681,
165
- "grad_norm": 0.2801424562931061,
166
- "learning_rate": 4.950116048011739e-06,
167
- "loss": 0.1856,
168
  "step": 1800
169
  },
170
  {
171
- "epoch": 2.898550724637681,
172
- "eval_accuracy": 0.9726688102893891,
173
- "eval_f1": 0.9726543269299354,
174
- "eval_loss": 0.0953439399600029,
175
- "eval_precision": 0.9736809257583791,
176
- "eval_recall": 0.9726688102893891,
177
- "eval_runtime": 5.7237,
178
- "eval_samples_per_second": 108.672,
179
- "eval_steps_per_second": 27.255,
180
  "step": 1800
181
  },
182
  {
183
- "epoch": 3.2206119162640903,
184
- "grad_norm": 0.754078209400177,
185
- "learning_rate": 4.862254033772164e-06,
186
- "loss": 0.1113,
187
  "step": 2000
188
  },
189
  {
190
- "epoch": 3.2206119162640903,
191
- "eval_accuracy": 0.9823151125401929,
192
- "eval_f1": 0.9823107313578804,
193
- "eval_loss": 0.05166807398200035,
194
- "eval_precision": 0.9823692945184218,
195
- "eval_recall": 0.9823151125401929,
196
- "eval_runtime": 5.6315,
197
- "eval_samples_per_second": 110.451,
198
- "eval_steps_per_second": 27.702,
199
  "step": 2000
200
  },
201
  {
202
- "epoch": 3.542673107890499,
203
- "grad_norm": 0.2094874083995819,
204
- "learning_rate": 4.7324160849755856e-06,
205
- "loss": 0.0492,
206
  "step": 2200
207
  },
208
  {
209
- "epoch": 3.542673107890499,
210
- "eval_accuracy": 0.9855305466237942,
211
- "eval_f1": 0.9855078289074475,
212
- "eval_loss": 0.04900892823934555,
213
- "eval_precision": 0.9855788271208662,
214
- "eval_recall": 0.9855305466237942,
215
- "eval_runtime": 5.7859,
216
- "eval_samples_per_second": 107.503,
217
- "eval_steps_per_second": 26.962,
218
  "step": 2200
219
  },
220
  {
221
- "epoch": 3.864734299516908,
222
- "grad_norm": 0.26868194341659546,
223
- "learning_rate": 4.562909349440899e-06,
224
- "loss": 0.0584,
225
  "step": 2400
226
  },
227
  {
228
- "epoch": 3.864734299516908,
229
- "eval_accuracy": 0.9855305466237942,
230
- "eval_f1": 0.9855183714453405,
231
- "eval_loss": 0.03780783340334892,
232
- "eval_precision": 0.9856657355462896,
233
- "eval_recall": 0.9855305466237942,
234
- "eval_runtime": 5.7375,
235
- "eval_samples_per_second": 108.409,
236
- "eval_steps_per_second": 27.189,
237
  "step": 2400
238
  }
239
  ],
240
  "logging_steps": 200,
241
- "max_steps": 6210,
242
  "num_input_tokens_seen": 0,
243
  "num_train_epochs": 10,
244
  "save_steps": 200,
@@ -263,7 +263,7 @@
263
  "attributes": {}
264
  }
265
  },
266
- "total_flos": 383614021649664.0,
267
  "train_batch_size": 4,
268
  "trial_name": null,
269
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.02599843218922615,
3
+ "best_model_checkpoint": "./results4/checkpoint-2400",
4
+ "epoch": 4.4036697247706424,
5
  "eval_steps": 200,
6
  "global_step": 2400,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.3669724770642202,
13
+ "grad_norm": 6.81126594543457,
14
  "learning_rate": 6.666666666666667e-07,
15
+ "loss": 1.8154,
16
  "step": 200
17
  },
18
  {
19
+ "epoch": 0.3669724770642202,
20
+ "eval_accuracy": 0.12834224598930483,
21
+ "eval_f1": 0.053512789620671505,
22
+ "eval_loss": 1.8153961896896362,
23
+ "eval_precision": 0.03873999770058728,
24
+ "eval_recall": 0.12834224598930483,
25
+ "eval_runtime": 8.8051,
26
+ "eval_samples_per_second": 106.189,
27
+ "eval_steps_per_second": 26.576,
28
  "step": 200
29
  },
30
  {
31
+ "epoch": 0.7339449541284404,
32
+ "grad_norm": 9.484223365783691,
33
  "learning_rate": 1.3333333333333334e-06,
34
+ "loss": 1.7996,
35
  "step": 400
36
  },
37
  {
38
+ "epoch": 0.7339449541284404,
39
+ "eval_accuracy": 0.20962566844919786,
40
+ "eval_f1": 0.1699240887979582,
41
+ "eval_loss": 1.776768684387207,
42
+ "eval_precision": 0.2712506960219069,
43
+ "eval_recall": 0.20962566844919786,
44
+ "eval_runtime": 8.617,
45
+ "eval_samples_per_second": 108.506,
46
+ "eval_steps_per_second": 27.155,
47
  "step": 400
48
  },
49
  {
50
+ "epoch": 1.1009174311926606,
51
+ "grad_norm": 8.83604621887207,
52
  "learning_rate": 2.0000000000000003e-06,
53
+ "loss": 1.7653,
54
  "step": 600
55
  },
56
  {
57
+ "epoch": 1.1009174311926606,
58
+ "eval_accuracy": 0.35508021390374334,
59
+ "eval_f1": 0.34776363638784535,
60
+ "eval_loss": 1.7193909883499146,
61
+ "eval_precision": 0.5237977574987844,
62
+ "eval_recall": 0.35508021390374334,
63
+ "eval_runtime": 8.8621,
64
+ "eval_samples_per_second": 105.506,
65
+ "eval_steps_per_second": 26.405,
66
  "step": 600
67
  },
68
  {
69
+ "epoch": 1.4678899082568808,
70
+ "grad_norm": 12.071432113647461,
71
  "learning_rate": 2.666666666666667e-06,
72
+ "loss": 1.7051,
73
  "step": 800
74
  },
75
  {
76
+ "epoch": 1.4678899082568808,
77
+ "eval_accuracy": 0.5401069518716578,
78
+ "eval_f1": 0.5429174295461449,
79
+ "eval_loss": 1.6069858074188232,
80
+ "eval_precision": 0.6148066517946612,
81
+ "eval_recall": 0.5401069518716578,
82
+ "eval_runtime": 8.6363,
83
+ "eval_samples_per_second": 108.263,
84
+ "eval_steps_per_second": 27.095,
85
  "step": 800
86
  },
87
  {
88
+ "epoch": 1.834862385321101,
89
+ "grad_norm": 8.39781379699707,
90
  "learning_rate": 3.3333333333333333e-06,
91
+ "loss": 1.561,
92
  "step": 1000
93
  },
94
  {
95
+ "epoch": 1.834862385321101,
96
+ "eval_accuracy": 0.6181818181818182,
97
+ "eval_f1": 0.599751774424811,
98
+ "eval_loss": 1.3523486852645874,
99
+ "eval_precision": 0.6869881919032953,
100
+ "eval_recall": 0.6181818181818182,
101
+ "eval_runtime": 8.7123,
102
+ "eval_samples_per_second": 107.32,
103
+ "eval_steps_per_second": 26.859,
104
  "step": 1000
105
  },
106
  {
107
+ "epoch": 2.2018348623853212,
108
+ "grad_norm": 7.149374008178711,
109
  "learning_rate": 4.000000000000001e-06,
110
+ "loss": 1.2159,
111
  "step": 1200
112
  },
113
  {
114
+ "epoch": 2.2018348623853212,
115
+ "eval_accuracy": 0.7561497326203208,
116
+ "eval_f1": 0.7396853774240922,
117
+ "eval_loss": 0.8623968958854675,
118
+ "eval_precision": 0.7976461918397214,
119
+ "eval_recall": 0.7561497326203208,
120
+ "eval_runtime": 8.4876,
121
+ "eval_samples_per_second": 110.161,
122
+ "eval_steps_per_second": 27.57,
123
  "step": 1200
124
  },
125
  {
126
+ "epoch": 2.5688073394495414,
127
+ "grad_norm": 12.584464073181152,
128
  "learning_rate": 4.666666666666667e-06,
129
+ "loss": 0.7501,
130
  "step": 1400
131
  },
132
  {
133
+ "epoch": 2.5688073394495414,
134
+ "eval_accuracy": 0.8909090909090909,
135
+ "eval_f1": 0.8893217599642673,
136
+ "eval_loss": 0.43212181329727173,
137
+ "eval_precision": 0.8944228004598542,
138
+ "eval_recall": 0.8909090909090909,
139
+ "eval_runtime": 8.5383,
140
+ "eval_samples_per_second": 109.507,
141
+ "eval_steps_per_second": 27.406,
142
  "step": 1400
143
  },
144
  {
145
+ "epoch": 2.9357798165137616,
146
+ "grad_norm": 6.3582305908203125,
147
+ "learning_rate": 4.99209709753674e-06,
148
+ "loss": 0.4346,
149
  "step": 1600
150
  },
151
  {
152
+ "epoch": 2.9357798165137616,
153
+ "eval_accuracy": 0.9401069518716577,
154
+ "eval_f1": 0.939586410891439,
155
+ "eval_loss": 0.20562343299388885,
156
+ "eval_precision": 0.942115798236324,
157
+ "eval_recall": 0.9401069518716577,
158
+ "eval_runtime": 8.5478,
159
+ "eval_samples_per_second": 109.384,
160
+ "eval_steps_per_second": 27.375,
161
  "step": 1600
162
  },
163
  {
164
+ "epoch": 3.302752293577982,
165
+ "grad_norm": 1.9859445095062256,
166
+ "learning_rate": 4.929173350101025e-06,
167
+ "loss": 0.1985,
168
  "step": 1800
169
  },
170
  {
171
+ "epoch": 3.302752293577982,
172
+ "eval_accuracy": 0.9796791443850268,
173
+ "eval_f1": 0.9795557753030716,
174
+ "eval_loss": 0.07811883836984634,
175
+ "eval_precision": 0.9796698126299838,
176
+ "eval_recall": 0.9796791443850268,
177
+ "eval_runtime": 8.5151,
178
+ "eval_samples_per_second": 109.804,
179
+ "eval_steps_per_second": 27.48,
180
  "step": 1800
181
  },
182
  {
183
+ "epoch": 3.669724770642202,
184
+ "grad_norm": 3.169071912765503,
185
+ "learning_rate": 4.804914636820517e-06,
186
+ "loss": 0.1066,
187
  "step": 2000
188
  },
189
  {
190
+ "epoch": 3.669724770642202,
191
+ "eval_accuracy": 0.9828877005347594,
192
+ "eval_f1": 0.9828341396664676,
193
+ "eval_loss": 0.05222497880458832,
194
+ "eval_precision": 0.9829524348459922,
195
+ "eval_recall": 0.9828877005347594,
196
+ "eval_runtime": 8.3944,
197
+ "eval_samples_per_second": 111.384,
198
+ "eval_steps_per_second": 27.876,
199
  "step": 2000
200
  },
201
  {
202
+ "epoch": 4.036697247706422,
203
+ "grad_norm": 0.22888700664043427,
204
+ "learning_rate": 4.622458405228411e-06,
205
+ "loss": 0.096,
206
  "step": 2200
207
  },
208
  {
209
+ "epoch": 4.036697247706422,
210
+ "eval_accuracy": 0.986096256684492,
211
+ "eval_f1": 0.9860397886588865,
212
+ "eval_loss": 0.037959374487400055,
213
+ "eval_precision": 0.9862011528885352,
214
+ "eval_recall": 0.986096256684492,
215
+ "eval_runtime": 8.8571,
216
+ "eval_samples_per_second": 105.565,
217
+ "eval_steps_per_second": 26.42,
218
  "step": 2200
219
  },
220
  {
221
+ "epoch": 4.4036697247706424,
222
+ "grad_norm": 0.051910221576690674,
223
+ "learning_rate": 4.386411550395576e-06,
224
+ "loss": 0.0686,
225
  "step": 2400
226
  },
227
  {
228
+ "epoch": 4.4036697247706424,
229
+ "eval_accuracy": 0.9925133689839573,
230
+ "eval_f1": 0.9925048378298892,
231
+ "eval_loss": 0.02599843218922615,
232
+ "eval_precision": 0.9925302733753678,
233
+ "eval_recall": 0.9925133689839573,
234
+ "eval_runtime": 8.5262,
235
+ "eval_samples_per_second": 109.661,
236
+ "eval_steps_per_second": 27.445,
237
  "step": 2400
238
  }
239
  ],
240
  "logging_steps": 200,
241
+ "max_steps": 5450,
242
  "num_input_tokens_seen": 0,
243
  "num_train_epochs": 10,
244
  "save_steps": 200,
 
263
  "attributes": {}
264
  }
265
  },
266
+ "total_flos": 396630381488796.0,
267
  "train_batch_size": 4,
268
  "trial_name": null,
269
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c065fc0c8ef5911d83c5bd37ed6bf30478028d63d9361ae1826f528ca5cf4aa
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6482c450a26310e15748f04ef9bf209e56d116e1fe95bfaea393554505069a88
3
  size 5304