recepbulbul commited on
Commit
e2e324f
1 Parent(s): 1c5cd59

Model daha fazla veri ile yeniden optimize edildi

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +90 -202
  6. training_args.bin +2 -2
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e39da7665c37f20b94b26e31013d0e7acb670a7c74118d111cd44ddd1f15adab
3
  size 2477521472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ae60214b33882a057a639ec4a80e96b390320019843720440e62ed048d966f
3
  size 2477521472
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27a1cc8e1d26188ecee29d6fd87211e606a652a58f617330497529cfa4e0a358
3
  size 4955506101
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a62f248e0eb1d0200aebd2a333194203aaa205a130aff2ff222f3a2df6e8f54
3
  size 4955506101
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:269dbf2213fadf2fab87d4e6cb9754109bbb43bf28ee2ee952a420ea0870b34f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fabe990932866923517fededfb8def95df36b72db0e74bc97b6f5d0f3574b81e
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b097d732cba2ad47b9977234e3349f0a6ee2c416c7a8fb6f99d6ff0b3dd99027
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b954775169ae5b7aa230f7be73e03f3869c27dd6f4a8086af73720d31b5722b5
3
  size 1064
trainer_state.json CHANGED
@@ -3,256 +3,144 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 15010,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.1665556295802798,
13
- "grad_norm": 1.6862213611602783,
14
- "learning_rate": 0.00048334443704197203,
15
- "loss": 2.122,
16
  "step": 500
17
  },
18
  {
19
- "epoch": 0.3331112591605596,
20
- "grad_norm": 0.9895781874656677,
21
- "learning_rate": 0.00046668887408394405,
22
- "loss": 1.6754,
23
  "step": 1000
24
  },
25
  {
26
- "epoch": 0.4996668887408394,
27
- "grad_norm": 1.4011154174804688,
28
- "learning_rate": 0.00045003331112591607,
29
- "loss": 1.5258,
 
 
 
 
 
 
 
 
30
  "step": 1500
31
  },
32
  {
33
- "epoch": 0.6662225183211192,
34
- "grad_norm": 1.263901948928833,
35
- "learning_rate": 0.0004333777481678881,
36
- "loss": 1.3949,
37
  "step": 2000
38
  },
39
  {
40
- "epoch": 0.832778147901399,
41
- "grad_norm": 0.9926736354827881,
42
- "learning_rate": 0.0004167221852098601,
43
- "loss": 1.323,
44
  "step": 2500
45
  },
46
  {
47
- "epoch": 0.9993337774816788,
48
- "grad_norm": 1.2104265689849854,
49
- "learning_rate": 0.0004000666222518321,
50
- "loss": 1.2978,
51
- "step": 3000
 
52
  },
53
  {
54
- "epoch": 1.0,
55
- "eval_loss": 1.1156065464019775,
56
- "eval_runtime": 37.2553,
57
- "eval_samples_per_second": 80.579,
58
- "eval_steps_per_second": 10.093,
59
- "step": 3002
60
  },
61
  {
62
- "epoch": 1.1658894070619588,
63
- "grad_norm": 0.9220499992370605,
64
- "learning_rate": 0.00038341105929380414,
65
- "loss": 0.964,
66
  "step": 3500
67
  },
68
  {
69
- "epoch": 1.3324450366422385,
70
- "grad_norm": 1.2684720754623413,
71
- "learning_rate": 0.00036675549633577616,
72
- "loss": 0.9784,
73
  "step": 4000
74
  },
75
  {
76
- "epoch": 1.4990006662225184,
77
- "grad_norm": 1.3914306163787842,
78
- "learning_rate": 0.0003500999333777481,
79
- "loss": 0.9265,
 
 
 
 
 
 
 
 
80
  "step": 4500
81
  },
82
  {
83
- "epoch": 1.6655562958027983,
84
- "grad_norm": 1.0393187999725342,
85
- "learning_rate": 0.0003334443704197202,
86
- "loss": 0.9401,
87
  "step": 5000
88
  },
89
  {
90
- "epoch": 1.832111925383078,
91
- "grad_norm": 1.1595275402069092,
92
- "learning_rate": 0.0003167888074616922,
93
- "loss": 0.9091,
94
  "step": 5500
95
  },
96
  {
97
- "epoch": 1.9986675549633577,
98
- "grad_norm": 1.2401949167251587,
99
- "learning_rate": 0.00030013324450366423,
100
- "loss": 0.9271,
101
- "step": 6000
 
102
  },
103
  {
104
- "epoch": 2.0,
105
- "eval_loss": 0.9488099217414856,
106
- "eval_runtime": 37.415,
107
- "eval_samples_per_second": 80.235,
108
- "eval_steps_per_second": 10.049,
109
- "step": 6004
110
  },
111
  {
112
- "epoch": 2.1652231845436374,
113
- "grad_norm": 0.6324372887611389,
114
- "learning_rate": 0.00028347768154563625,
115
- "loss": 0.6576,
116
  "step": 6500
117
  },
118
  {
119
- "epoch": 2.3317788141239175,
120
- "grad_norm": 1.377943992614746,
121
- "learning_rate": 0.00026682211858760827,
122
- "loss": 0.6499,
123
  "step": 7000
124
- },
125
- {
126
- "epoch": 2.498334443704197,
127
- "grad_norm": 0.9929794669151306,
128
- "learning_rate": 0.0002501665556295803,
129
- "loss": 0.6509,
130
- "step": 7500
131
- },
132
- {
133
- "epoch": 2.664890073284477,
134
- "grad_norm": 1.331009030342102,
135
- "learning_rate": 0.0002335109926715523,
136
- "loss": 0.6492,
137
- "step": 8000
138
- },
139
- {
140
- "epoch": 2.831445702864757,
141
- "grad_norm": 0.9260538816452026,
142
- "learning_rate": 0.00021685542971352432,
143
- "loss": 0.6765,
144
- "step": 8500
145
- },
146
- {
147
- "epoch": 2.9980013324450367,
148
- "grad_norm": 1.6844342947006226,
149
- "learning_rate": 0.00020019986675549634,
150
- "loss": 0.6452,
151
- "step": 9000
152
- },
153
- {
154
- "epoch": 3.0,
155
- "eval_loss": 0.9022971391677856,
156
- "eval_runtime": 37.548,
157
- "eval_samples_per_second": 79.951,
158
- "eval_steps_per_second": 10.014,
159
- "step": 9006
160
- },
161
- {
162
- "epoch": 3.1645569620253164,
163
- "grad_norm": 0.6371086835861206,
164
- "learning_rate": 0.00018354430379746836,
165
- "loss": 0.4646,
166
- "step": 9500
167
- },
168
- {
169
- "epoch": 3.331112591605596,
170
- "grad_norm": 1.005698323249817,
171
- "learning_rate": 0.00016688874083944038,
172
- "loss": 0.4706,
173
- "step": 10000
174
- },
175
- {
176
- "epoch": 3.497668221185876,
177
- "grad_norm": 0.990774393081665,
178
- "learning_rate": 0.0001502331778814124,
179
- "loss": 0.434,
180
- "step": 10500
181
- },
182
- {
183
- "epoch": 3.664223850766156,
184
- "grad_norm": 0.7444645762443542,
185
- "learning_rate": 0.00013357761492338441,
186
- "loss": 0.4682,
187
- "step": 11000
188
- },
189
- {
190
- "epoch": 3.8307794803464357,
191
- "grad_norm": 1.1938289403915405,
192
- "learning_rate": 0.00011692205196535643,
193
- "loss": 0.4428,
194
- "step": 11500
195
- },
196
- {
197
- "epoch": 3.9973351099267154,
198
- "grad_norm": 1.2232627868652344,
199
- "learning_rate": 0.00010026648900732845,
200
- "loss": 0.4446,
201
- "step": 12000
202
- },
203
- {
204
- "epoch": 4.0,
205
- "eval_loss": 0.9124976396560669,
206
- "eval_runtime": 37.6584,
207
- "eval_samples_per_second": 79.717,
208
- "eval_steps_per_second": 9.985,
209
- "step": 12008
210
- },
211
- {
212
- "epoch": 4.1638907395069955,
213
- "grad_norm": 1.035305380821228,
214
- "learning_rate": 8.361092604930047e-05,
215
- "loss": 0.3292,
216
- "step": 12500
217
- },
218
- {
219
- "epoch": 4.330446369087275,
220
- "grad_norm": 1.409875512123108,
221
- "learning_rate": 6.695536309127249e-05,
222
- "loss": 0.321,
223
- "step": 13000
224
- },
225
- {
226
- "epoch": 4.497001998667555,
227
- "grad_norm": 1.468259334564209,
228
- "learning_rate": 5.0299800133244506e-05,
229
- "loss": 0.3161,
230
- "step": 13500
231
- },
232
- {
233
- "epoch": 4.663557628247835,
234
- "grad_norm": 1.00761878490448,
235
- "learning_rate": 3.3644237175216524e-05,
236
- "loss": 0.3214,
237
- "step": 14000
238
- },
239
- {
240
- "epoch": 4.830113257828114,
241
- "grad_norm": 0.7190210223197937,
242
- "learning_rate": 1.698867421718854e-05,
243
- "loss": 0.3037,
244
- "step": 14500
245
- },
246
- {
247
- "epoch": 4.996668887408394,
248
- "grad_norm": 0.6449595093727112,
249
- "learning_rate": 3.3311125916055966e-07,
250
- "loss": 0.3045,
251
- "step": 15000
252
  }
253
  ],
254
  "logging_steps": 500,
255
- "max_steps": 15010,
256
  "num_input_tokens_seen": 0,
257
  "num_train_epochs": 5,
258
  "save_steps": 10000,
@@ -268,8 +156,8 @@
268
  "attributes": {}
269
  }
270
  },
271
- "total_flos": 6083104659545088.0,
272
- "train_batch_size": 4,
273
  "trial_name": null,
274
  "trial_params": null
275
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 7430,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.3364737550471063,
13
+ "grad_norm": 0.923697829246521,
14
+ "learning_rate": 0.00046635262449528937,
15
+ "loss": 1.9724,
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 0.6729475100942126,
20
+ "grad_norm": 0.9243040084838867,
21
+ "learning_rate": 0.0004327052489905787,
22
+ "loss": 1.4423,
23
  "step": 1000
24
  },
25
  {
26
+ "epoch": 1.0,
27
+ "eval_loss": 1.0461400747299194,
28
+ "eval_runtime": 36.2574,
29
+ "eval_samples_per_second": 81.942,
30
+ "eval_steps_per_second": 10.26,
31
+ "step": 1486
32
+ },
33
+ {
34
+ "epoch": 1.009421265141319,
35
+ "grad_norm": 0.9555139541625977,
36
+ "learning_rate": 0.0003990578734858681,
37
+ "loss": 1.2854,
38
  "step": 1500
39
  },
40
  {
41
+ "epoch": 1.3458950201884252,
42
+ "grad_norm": 0.8507774472236633,
43
+ "learning_rate": 0.0003654104979811575,
44
+ "loss": 0.9929,
45
  "step": 2000
46
  },
47
  {
48
+ "epoch": 1.6823687752355316,
49
+ "grad_norm": 1.1206731796264648,
50
+ "learning_rate": 0.00033176312247644685,
51
+ "loss": 0.9408,
52
  "step": 2500
53
  },
54
  {
55
+ "epoch": 2.0,
56
+ "eval_loss": 0.9026183485984802,
57
+ "eval_runtime": 36.3158,
58
+ "eval_samples_per_second": 81.81,
59
+ "eval_steps_per_second": 10.243,
60
+ "step": 2972
61
  },
62
  {
63
+ "epoch": 2.018842530282638,
64
+ "grad_norm": 0.7318525910377502,
65
+ "learning_rate": 0.0002981157469717362,
66
+ "loss": 0.8886,
67
+ "step": 3000
 
68
  },
69
  {
70
+ "epoch": 2.3553162853297445,
71
+ "grad_norm": 1.1639642715454102,
72
+ "learning_rate": 0.00026446837146702556,
73
+ "loss": 0.6969,
74
  "step": 3500
75
  },
76
  {
77
+ "epoch": 2.6917900403768504,
78
+ "grad_norm": 0.7347049117088318,
79
+ "learning_rate": 0.00023082099596231497,
80
+ "loss": 0.692,
81
  "step": 4000
82
  },
83
  {
84
+ "epoch": 3.0,
85
+ "eval_loss": 0.8661695122718811,
86
+ "eval_runtime": 36.5469,
87
+ "eval_samples_per_second": 81.293,
88
+ "eval_steps_per_second": 10.179,
89
+ "step": 4458
90
+ },
91
+ {
92
+ "epoch": 3.028263795423957,
93
+ "grad_norm": 0.7746924757957458,
94
+ "learning_rate": 0.00019717362045760433,
95
+ "loss": 0.6564,
96
  "step": 4500
97
  },
98
  {
99
+ "epoch": 3.3647375504710633,
100
+ "grad_norm": 0.7316901087760925,
101
+ "learning_rate": 0.00016352624495289368,
102
+ "loss": 0.4934,
103
  "step": 5000
104
  },
105
  {
106
+ "epoch": 3.7012113055181697,
107
+ "grad_norm": 0.9040531516075134,
108
+ "learning_rate": 0.00012987886944818307,
109
+ "loss": 0.5261,
110
  "step": 5500
111
  },
112
  {
113
+ "epoch": 4.0,
114
+ "eval_loss": 0.8571327924728394,
115
+ "eval_runtime": 36.3399,
116
+ "eval_samples_per_second": 81.756,
117
+ "eval_steps_per_second": 10.237,
118
+ "step": 5944
119
  },
120
  {
121
+ "epoch": 4.037685060565276,
122
+ "grad_norm": 0.9058707356452942,
123
+ "learning_rate": 9.623149394347241e-05,
124
+ "loss": 0.4785,
125
+ "step": 6000
 
126
  },
127
  {
128
+ "epoch": 4.3741588156123825,
129
+ "grad_norm": 0.7362410426139832,
130
+ "learning_rate": 6.258411843876178e-05,
131
+ "loss": 0.3714,
132
  "step": 6500
133
  },
134
  {
135
+ "epoch": 4.710632570659489,
136
+ "grad_norm": 0.6890231370925903,
137
+ "learning_rate": 2.8936742934051144e-05,
138
+ "loss": 0.3846,
139
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  }
141
  ],
142
  "logging_steps": 500,
143
+ "max_steps": 7430,
144
  "num_input_tokens_seen": 0,
145
  "num_train_epochs": 5,
146
  "save_steps": 10000,
 
156
  "attributes": {}
157
  }
158
  },
159
+ "total_flos": 7016439606285312.0,
160
+ "train_batch_size": 8,
161
  "trial_name": null,
162
  "trial_params": null
163
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bc3740d66ce7f13a9bf52c0f372ab07710a244dec0c4cce502fad110e30edb3
3
- size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:830ccd11e7a4311136c7f354d64d3e6fe2cf261f2013d520ce30d43e50e1e5c4
3
+ size 5112