kanishka committed on
Commit
92997d0
·
1 Parent(s): 28f3bba

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +12 -12
  2. eval_results.json +7 -7
  3. train_results.json +5 -5
  4. trainer_state.json +112 -124
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.5425975187003966,
4
- "eval_loss": 2.2831156253814697,
5
- "eval_runtime": 4.4985,
6
- "eval_samples": 5038,
7
- "eval_samples_per_second": 1119.928,
8
- "eval_steps_per_second": 2.223,
9
- "perplexity": 9.807188384190734,
10
- "train_loss": 2.5465449372446276,
11
- "train_runtime": 743.7716,
12
- "train_samples": 52812,
13
- "train_samples_per_second": 710.057,
14
- "train_steps_per_second": 11.106
15
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.4861165901356519,
4
+ "eval_loss": 2.557232141494751,
5
+ "eval_runtime": 4.0987,
6
+ "eval_samples": 4491,
7
+ "eval_samples_per_second": 1095.703,
8
+ "eval_steps_per_second": 2.196,
9
+ "perplexity": 12.900062308790385,
10
+ "train_loss": 2.8881424867390284,
11
+ "train_runtime": 618.3653,
12
+ "train_samples": 46845,
13
+ "train_samples_per_second": 757.562,
14
+ "train_steps_per_second": 11.838
15
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.5425975187003966,
4
- "eval_loss": 2.2831156253814697,
5
- "eval_runtime": 4.4985,
6
- "eval_samples": 5038,
7
- "eval_samples_per_second": 1119.928,
8
- "eval_steps_per_second": 2.223,
9
- "perplexity": 9.807188384190734
10
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.4861165901356519,
4
+ "eval_loss": 2.557232141494751,
5
+ "eval_runtime": 4.0987,
6
+ "eval_samples": 4491,
7
+ "eval_samples_per_second": 1095.703,
8
+ "eval_steps_per_second": 2.196,
9
+ "perplexity": 12.900062308790385
10
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "train_loss": 2.5465449372446276,
4
- "train_runtime": 743.7716,
5
- "train_samples": 52812,
6
- "train_samples_per_second": 710.057,
7
- "train_steps_per_second": 11.106
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "train_loss": 2.8881424867390284,
4
+ "train_runtime": 618.3653,
5
+ "train_samples": 46845,
6
+ "train_samples_per_second": 757.562,
7
+ "train_steps_per_second": 11.838
8
  }
trainer_state.json CHANGED
@@ -3,212 +3,200 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 8260,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.61,
13
  "learning_rate": 6.25e-05,
14
- "loss": 5.8023,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 1.0,
19
- "eval_accuracy": 0.4621053223845233,
20
- "eval_loss": 3.109194755554199,
21
- "eval_runtime": 4.0139,
22
- "eval_samples_per_second": 1255.152,
23
- "eval_steps_per_second": 2.491,
24
- "step": 826
25
  },
26
  {
27
- "epoch": 1.21,
28
  "learning_rate": 0.000125,
29
- "loss": 3.1121,
30
  "step": 1000
31
  },
32
  {
33
- "epoch": 1.82,
34
- "learning_rate": 0.0001875,
35
- "loss": 2.7942,
36
- "step": 1500
 
 
 
37
  },
38
  {
39
- "epoch": 2.0,
40
- "eval_accuracy": 0.49748994257813844,
41
- "eval_loss": 2.7389166355133057,
42
- "eval_runtime": 4.1343,
43
- "eval_samples_per_second": 1218.585,
44
- "eval_steps_per_second": 2.419,
45
- "step": 1652
46
  },
47
  {
48
- "epoch": 2.42,
49
  "learning_rate": 0.00025,
50
- "loss": 2.625,
51
  "step": 2000
52
  },
53
  {
54
  "epoch": 3.0,
55
- "eval_accuracy": 0.511015494837659,
56
- "eval_loss": 2.5700840950012207,
57
- "eval_runtime": 4.4436,
58
- "eval_samples_per_second": 1133.756,
59
- "eval_steps_per_second": 2.25,
60
- "step": 2478
61
  },
62
  {
63
- "epoch": 3.03,
64
  "learning_rate": 0.0003125,
65
- "loss": 2.5117,
66
  "step": 2500
67
  },
68
  {
69
- "epoch": 3.63,
70
- "learning_rate": 0.000375,
71
- "loss": 2.412,
72
- "step": 3000
 
 
 
73
  },
74
  {
75
- "epoch": 4.0,
76
- "eval_accuracy": 0.5223295083350786,
77
- "eval_loss": 2.4618635177612305,
78
- "eval_runtime": 4.3105,
79
- "eval_samples_per_second": 1168.783,
80
- "eval_steps_per_second": 2.32,
81
- "step": 3304
82
  },
83
  {
84
- "epoch": 4.24,
85
  "learning_rate": 0.00043750000000000006,
86
- "loss": 2.3434,
87
  "step": 3500
88
  },
89
  {
90
- "epoch": 4.84,
 
 
 
 
 
 
 
 
 
91
  "learning_rate": 0.0005,
92
- "loss": 2.2885,
93
  "step": 4000
94
  },
95
  {
96
- "epoch": 5.0,
97
- "eval_accuracy": 0.5287562556069931,
98
- "eval_loss": 2.3939802646636963,
99
- "eval_runtime": 4.587,
100
- "eval_samples_per_second": 1098.33,
101
- "eval_steps_per_second": 2.18,
102
- "step": 4130
103
  },
104
  {
105
- "epoch": 5.45,
106
  "learning_rate": 0.0005625000000000001,
107
- "loss": 2.2294,
108
  "step": 4500
109
  },
110
  {
111
- "epoch": 6.0,
112
- "eval_accuracy": 0.5341733533804504,
113
- "eval_loss": 2.346377372741699,
114
- "eval_runtime": 4.5113,
115
- "eval_samples_per_second": 1116.745,
116
- "eval_steps_per_second": 2.217,
117
- "step": 4956
118
- },
119
- {
120
- "epoch": 6.05,
121
  "learning_rate": 0.000625,
122
- "loss": 2.2056,
123
  "step": 5000
124
  },
125
  {
126
- "epoch": 6.66,
 
 
 
 
 
 
 
 
 
127
  "learning_rate": 0.0006875,
128
- "loss": 2.16,
129
  "step": 5500
130
  },
131
  {
132
- "epoch": 7.0,
133
- "eval_accuracy": 0.5371960501761417,
134
- "eval_loss": 2.320580005645752,
135
- "eval_runtime": 4.5241,
136
- "eval_samples_per_second": 1113.583,
137
- "eval_steps_per_second": 2.21,
138
- "step": 5782
139
  },
140
  {
141
- "epoch": 7.26,
142
  "learning_rate": 0.00075,
143
- "loss": 2.1379,
144
  "step": 6000
145
  },
146
  {
147
- "epoch": 7.87,
148
  "learning_rate": 0.0008125,
149
- "loss": 2.1272,
150
  "step": 6500
151
  },
152
  {
153
- "epoch": 8.0,
154
- "eval_accuracy": 0.5394685430101309,
155
- "eval_loss": 2.3045718669891357,
156
- "eval_runtime": 4.5162,
157
- "eval_samples_per_second": 1115.551,
158
- "eval_steps_per_second": 2.214,
159
- "step": 6608
160
  },
161
  {
162
- "epoch": 8.47,
163
  "learning_rate": 0.0008750000000000001,
164
- "loss": 2.0865,
165
  "step": 7000
166
  },
167
- {
168
- "epoch": 9.0,
169
- "eval_accuracy": 0.540476629583668,
170
- "eval_loss": 2.291161298751831,
171
- "eval_runtime": 4.4842,
172
- "eval_samples_per_second": 1123.503,
173
- "eval_steps_per_second": 2.23,
174
- "step": 7434
175
- },
176
- {
177
- "epoch": 9.08,
178
- "learning_rate": 0.0009375,
179
- "loss": 2.0927,
180
- "step": 7500
181
- },
182
- {
183
- "epoch": 9.69,
184
- "learning_rate": 0.001,
185
- "loss": 2.0577,
186
- "step": 8000
187
- },
188
  {
189
  "epoch": 10.0,
190
- "eval_accuracy": 0.5425975187003966,
191
- "eval_loss": 2.2831156253814697,
192
- "eval_runtime": 4.7038,
193
- "eval_samples_per_second": 1071.043,
194
- "eval_steps_per_second": 2.126,
195
- "step": 8260
196
  },
197
  {
198
  "epoch": 10.0,
199
- "step": 8260,
200
- "total_flos": 2562796651806720.0,
201
- "train_loss": 2.5465449372446276,
202
- "train_runtime": 743.7716,
203
- "train_samples_per_second": 710.057,
204
- "train_steps_per_second": 11.106
205
  }
206
  ],
207
  "logging_steps": 500,
208
- "max_steps": 8260,
209
  "num_train_epochs": 10,
210
  "save_steps": 2000,
211
- "total_flos": 2562796651806720.0,
212
  "trial_name": null,
213
  "trial_params": null
214
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 7320,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.68,
13
  "learning_rate": 6.25e-05,
14
+ "loss": 6.1167,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 1.0,
19
+ "eval_accuracy": 0.39470717462922345,
20
+ "eval_loss": 3.535817861557007,
21
+ "eval_runtime": 3.8077,
22
+ "eval_samples_per_second": 1179.442,
23
+ "eval_steps_per_second": 2.364,
24
+ "step": 732
25
  },
26
  {
27
+ "epoch": 1.37,
28
  "learning_rate": 0.000125,
29
+ "loss": 3.4801,
30
  "step": 1000
31
  },
32
  {
33
+ "epoch": 2.0,
34
+ "eval_accuracy": 0.43199785397566787,
35
+ "eval_loss": 3.1201677322387695,
36
+ "eval_runtime": 3.9073,
37
+ "eval_samples_per_second": 1149.375,
38
+ "eval_steps_per_second": 2.303,
39
+ "step": 1464
40
  },
41
  {
42
+ "epoch": 2.05,
43
+ "learning_rate": 0.0001875,
44
+ "loss": 3.1329,
45
+ "step": 1500
 
 
 
46
  },
47
  {
48
+ "epoch": 2.73,
49
  "learning_rate": 0.00025,
50
+ "loss": 2.9429,
51
  "step": 2000
52
  },
53
  {
54
  "epoch": 3.0,
55
+ "eval_accuracy": 0.44977794609341165,
56
+ "eval_loss": 2.915814161300659,
57
+ "eval_runtime": 4.1629,
58
+ "eval_samples_per_second": 1078.826,
59
+ "eval_steps_per_second": 2.162,
60
+ "step": 2196
61
  },
62
  {
63
+ "epoch": 3.42,
64
  "learning_rate": 0.0003125,
65
+ "loss": 2.8071,
66
  "step": 2500
67
  },
68
  {
69
+ "epoch": 4.0,
70
+ "eval_accuracy": 0.460516834193321,
71
+ "eval_loss": 2.790581464767456,
72
+ "eval_runtime": 4.0203,
73
+ "eval_samples_per_second": 1117.085,
74
+ "eval_steps_per_second": 2.239,
75
+ "step": 2928
76
  },
77
  {
78
+ "epoch": 4.1,
79
+ "learning_rate": 0.000375,
80
+ "loss": 2.7064,
81
+ "step": 3000
 
 
 
82
  },
83
  {
84
+ "epoch": 4.78,
85
  "learning_rate": 0.00043750000000000006,
86
+ "loss": 2.6197,
87
  "step": 3500
88
  },
89
  {
90
+ "epoch": 5.0,
91
+ "eval_accuracy": 0.4701406312186929,
92
+ "eval_loss": 2.6998250484466553,
93
+ "eval_runtime": 4.0911,
94
+ "eval_samples_per_second": 1097.737,
95
+ "eval_steps_per_second": 2.2,
96
+ "step": 3660
97
+ },
98
+ {
99
+ "epoch": 5.46,
100
  "learning_rate": 0.0005,
101
+ "loss": 2.5459,
102
  "step": 4000
103
  },
104
  {
105
+ "epoch": 6.0,
106
+ "eval_accuracy": 0.4759738199057783,
107
+ "eval_loss": 2.641901969909668,
108
+ "eval_runtime": 4.078,
109
+ "eval_samples_per_second": 1101.267,
110
+ "eval_steps_per_second": 2.207,
111
+ "step": 4392
112
  },
113
  {
114
+ "epoch": 6.15,
115
  "learning_rate": 0.0005625000000000001,
116
+ "loss": 2.5015,
117
  "step": 4500
118
  },
119
  {
120
+ "epoch": 6.83,
 
 
 
 
 
 
 
 
 
121
  "learning_rate": 0.000625,
122
+ "loss": 2.4492,
123
  "step": 5000
124
  },
125
  {
126
+ "epoch": 7.0,
127
+ "eval_accuracy": 0.4802465824036524,
128
+ "eval_loss": 2.6036274433135986,
129
+ "eval_runtime": 4.1331,
130
+ "eval_samples_per_second": 1086.595,
131
+ "eval_steps_per_second": 2.178,
132
+ "step": 5124
133
+ },
134
+ {
135
+ "epoch": 7.51,
136
  "learning_rate": 0.0006875,
137
+ "loss": 2.4065,
138
  "step": 5500
139
  },
140
  {
141
+ "epoch": 8.0,
142
+ "eval_accuracy": 0.4823891001600752,
143
+ "eval_loss": 2.5769591331481934,
144
+ "eval_runtime": 3.9166,
145
+ "eval_samples_per_second": 1146.648,
146
+ "eval_steps_per_second": 2.298,
147
+ "step": 5856
148
  },
149
  {
150
+ "epoch": 8.2,
151
  "learning_rate": 0.00075,
152
+ "loss": 2.3867,
153
  "step": 6000
154
  },
155
  {
156
+ "epoch": 8.88,
157
  "learning_rate": 0.0008125,
158
+ "loss": 2.3626,
159
  "step": 6500
160
  },
161
  {
162
+ "epoch": 9.0,
163
+ "eval_accuracy": 0.4863024386480748,
164
+ "eval_loss": 2.5622453689575195,
165
+ "eval_runtime": 3.9696,
166
+ "eval_samples_per_second": 1131.334,
167
+ "eval_steps_per_second": 2.267,
168
+ "step": 6588
169
  },
170
  {
171
+ "epoch": 9.56,
172
  "learning_rate": 0.0008750000000000001,
173
+ "loss": 2.3276,
174
  "step": 7000
175
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  {
177
  "epoch": 10.0,
178
+ "eval_accuracy": 0.4861165901356519,
179
+ "eval_loss": 2.557232141494751,
180
+ "eval_runtime": 4.198,
181
+ "eval_samples_per_second": 1069.803,
182
+ "eval_steps_per_second": 2.144,
183
+ "step": 7320
184
  },
185
  {
186
  "epoch": 10.0,
187
+ "step": 7320,
188
+ "total_flos": 2273237316403200.0,
189
+ "train_loss": 2.8881424867390284,
190
+ "train_runtime": 618.3653,
191
+ "train_samples_per_second": 757.562,
192
+ "train_steps_per_second": 11.838
193
  }
194
  ],
195
  "logging_steps": 500,
196
+ "max_steps": 7320,
197
  "num_train_epochs": 10,
198
  "save_steps": 2000,
199
+ "total_flos": 2273237316403200.0,
200
  "trial_name": null,
201
  "trial_params": null
202
  }