Qin Liu committed
Commit acc2564
1 Parent(s): 6dca8e1

Model save
README.md CHANGED
@@ -1,11 +1,8 @@
 ---
 base_model: meta-llama/Meta-Llama-3-8B
- datasets:
- - HuggingFaceH4/ultrachat_200k
 library_name: peft
 license: llama3
 tags:
- - alignment-handbook
 - trl
 - sft
 - generated_from_trainer
@@ -19,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # llama3-sudo-sanity
 
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the HuggingFaceH4/ultrachat_200k dataset.
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
- - Loss: 1.7491
+ - Loss: 1.1030
 
 ## Model description
 
@@ -52,15 +49,22 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
- - num_epochs: 3
+ - num_epochs: 10
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
- | 1.8955 | 0.9899 | 49 | 1.8463 |
- | 1.8698 | 2.0 | 99 | 1.7678 |
- | 1.8282 | 2.9697 | 147 | 1.7491 |
+ | 1.8735 | 0.9899 | 49 | 1.8325 |
+ | 1.8231 | 2.0 | 99 | 1.7239 |
+ | 1.7516 | 2.9899 | 148 | 1.6330 |
+ | 1.6586 | 4.0 | 198 | 1.5280 |
+ | 1.5571 | 4.9899 | 247 | 1.4166 |
+ | 1.4677 | 6.0 | 297 | 1.3068 |
+ | 1.3422 | 6.9899 | 346 | 1.2082 |
+ | 1.2609 | 8.0 | 396 | 1.1378 |
+ | 1.1647 | 8.9899 | 445 | 1.1074 |
+ | 1.1571 | 9.8990 | 490 | 1.1030 |


 ### Framework versions
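
The card's hyperparameters (Adam with betas (0.9, 0.999) and epsilon 1e-8, cosine schedule, 0.1 warmup ratio, batch size 8, now 10 epochs, peak learning rate around 2e-4 judging from the trainer_state log) correspond to a fairly standard TRL SFT + PEFT run. The sketch below is only an illustration of how such a run could be configured: the actual training script, dataset, and adapter settings are not part of this commit, so the dataset path, LoRA rank/alpha, and target modules here are assumptions, and exact SFTTrainer keyword arguments vary between TRL versions.

```python
# Illustrative sketch only; not the script that produced this commit.
from datasets import load_dataset
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer

# Placeholder dataset: the card no longer names one.
dataset = load_dataset("json", data_files="train.json", split="train")

peft_config = LoraConfig(                  # hypothetical adapter settings
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

args = TrainingArguments(                  # values taken from the card / trainer_state
    output_dir="llama3-sudo-sanity",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    learning_rate=2e-4,                    # peak LR inferred from the logged schedule
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-8,
    logging_steps=5,
    save_steps=25,
    bf16=True,                             # assumed precision
)

trainer = SFTTrainer(
    model="meta-llama/Meta-Llama-3-8B",
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
```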
all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
- "epoch": 2.9696969696969697,
- "total_flos": 706816481427456.0,
- "train_loss": 1.9096906704156578,
- "train_runtime": 1619.2688,
+ "epoch": 9.8989898989899,
+ "total_flos": 2344635780825088.0,
+ "train_loss": 1.5290005391957808,
+ "train_runtime": 5413.3076,
 "train_samples": 6321,
- "train_samples_per_second": 11.711,
+ "train_samples_per_second": 11.677,
 "train_steps_per_second": 0.091
 }
runs/Aug21_02-51-44_ip-172-31-10-237/events.out.tfevents.1724208715.ip-172-31-10-237.960579.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:2e5ed61f02b2dde45022f0805f6821c207a3231eb7876c3663dc764a5e92cbc8
- size 28793
+ oid sha256:658ebebff6d9c7b4b26c19b8f53502f8bcef7731700fb8153bb4498d3dfd5fb4
+ size 29418
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
- "epoch": 2.9696969696969697,
- "total_flos": 706816481427456.0,
- "train_loss": 1.9096906704156578,
- "train_runtime": 1619.2688,
+ "epoch": 9.8989898989899,
+ "total_flos": 2344635780825088.0,
+ "train_loss": 1.5290005391957808,
+ "train_runtime": 5413.3076,
 "train_samples": 6321,
- "train_samples_per_second": 11.711,
+ "train_samples_per_second": 11.677,
 "train_steps_per_second": 0.091
 }
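
As a quick consistency check (assuming the Hugging Face Trainer convention that train_samples_per_second is the total scheduled samples divided by wall-clock time), the new numbers line up:

```python
# samples/sec ≈ train_samples * num_train_epochs / train_runtime
print(6321 * 10 / 5413.3076)   # ≈ 11.68, vs. the reported 11.677
```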
trainer_state.json CHANGED
@@ -1,261 +1,800 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.9696969696969697,
5
  "eval_steps": 500,
6
- "global_step": 147,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.020202020202020204,
13
- "grad_norm": 1.08310938404347,
14
- "learning_rate": 1.3333333333333333e-05,
15
- "loss": 2.5976,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.10101010101010101,
20
- "grad_norm": 1.2015655639458453,
21
- "learning_rate": 6.666666666666667e-05,
22
- "loss": 2.5737,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.20202020202020202,
27
- "grad_norm": 0.5529484610069302,
28
- "learning_rate": 0.00013333333333333334,
29
- "loss": 2.4242,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.30303030303030304,
34
- "grad_norm": 0.44235368767500005,
35
- "learning_rate": 0.0002,
36
- "loss": 2.2287,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.40404040404040403,
41
- "grad_norm": 0.34186780209717693,
42
- "learning_rate": 0.00019929278846732884,
43
- "loss": 2.1199,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.5050505050505051,
48
- "grad_norm": 0.2690393065112005,
49
- "learning_rate": 0.00019718115683235417,
50
- "loss": 1.963,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.6060606060606061,
55
- "grad_norm": 0.2895526654213361,
56
- "learning_rate": 0.0001936949724999762,
57
- "loss": 1.9819,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.7070707070707071,
62
- "grad_norm": 0.21284939456784369,
63
- "learning_rate": 0.00018888354486549237,
64
- "loss": 1.9367,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.8080808080808081,
69
- "grad_norm": 0.220661039014496,
70
- "learning_rate": 0.00018281492787113708,
71
- "loss": 1.9123,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.9090909090909091,
76
- "grad_norm": 0.25800435226254403,
77
- "learning_rate": 0.00017557495743542585,
78
- "loss": 1.8955,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.98989898989899,
83
- "eval_loss": 1.8462597131729126,
84
- "eval_runtime": 164.2887,
85
- "eval_samples_per_second": 38.475,
86
- "eval_steps_per_second": 2.41,
87
  "step": 49
88
  },
89
  {
90
  "epoch": 1.0101010101010102,
91
- "grad_norm": 0.21280470872825646,
92
- "learning_rate": 0.00016726603737012529,
93
- "loss": 1.9023,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.1111111111111112,
98
- "grad_norm": 0.2281964993767009,
99
- "learning_rate": 0.00015800569095711982,
100
- "loss": 1.8291,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.2121212121212122,
105
- "grad_norm": 0.2468620521645798,
106
- "learning_rate": 0.0001479248986720057,
107
- "loss": 1.8448,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.3131313131313131,
112
- "grad_norm": 0.24256005211247963,
113
- "learning_rate": 0.00013716624556603274,
114
- "loss": 1.8667,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.4141414141414141,
119
- "grad_norm": 0.22252607581763362,
120
- "learning_rate": 0.00012588190451025207,
121
- "loss": 1.8213,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.5151515151515151,
126
- "grad_norm": 0.2742840702900069,
127
- "learning_rate": 0.00011423148382732853,
128
- "loss": 1.8122,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.6161616161616161,
133
- "grad_norm": 0.25494953140215326,
134
- "learning_rate": 0.00010237976975461075,
135
- "loss": 1.8726,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.7171717171717171,
140
- "grad_norm": 0.2657201331656562,
141
- "learning_rate": 9.049439566958175e-05,
142
- "loss": 1.8425,
143
  "step": 85
144
  },
145
  {
146
  "epoch": 1.8181818181818183,
147
- "grad_norm": 0.2700733549650734,
148
- "learning_rate": 7.874347104470234e-05,
149
- "loss": 1.8205,
150
  "step": 90
151
  },
152
  {
153
  "epoch": 1.9191919191919191,
154
- "grad_norm": 0.28284460248100685,
155
- "learning_rate": 6.729320366825784e-05,
156
- "loss": 1.8698,
157
  "step": 95
158
  },
159
  {
160
  "epoch": 2.0,
161
- "eval_loss": 1.7677603960037231,
162
- "eval_runtime": 177.7383,
163
- "eval_samples_per_second": 35.564,
164
- "eval_steps_per_second": 2.228,
165
  "step": 99
166
  },
167
  {
168
  "epoch": 2.0202020202020203,
169
- "grad_norm": 0.28052694585968674,
170
- "learning_rate": 5.630554876306407e-05,
171
- "loss": 1.8412,
172
  "step": 100
173
  },
174
  {
175
  "epoch": 2.121212121212121,
176
- "grad_norm": 0.30253313940579424,
177
- "learning_rate": 4.593591825444028e-05,
178
- "loss": 1.7843,
179
  "step": 105
180
  },
181
  {
182
  "epoch": 2.2222222222222223,
183
- "grad_norm": 0.2919901233198437,
184
- "learning_rate": 3.6330982588091186e-05,
185
- "loss": 1.8085,
186
  "step": 110
187
  },
188
  {
189
  "epoch": 2.323232323232323,
190
- "grad_norm": 0.2985415087687047,
191
- "learning_rate": 2.7626596189492983e-05,
192
- "loss": 1.7548,
193
  "step": 115
194
  },
195
  {
196
  "epoch": 2.4242424242424243,
197
- "grad_norm": 0.31809894547826195,
198
- "learning_rate": 1.994587590756397e-05,
199
- "loss": 1.758,
200
  "step": 120
201
  },
202
  {
203
  "epoch": 2.525252525252525,
204
- "grad_norm": 0.31151542636398494,
205
- "learning_rate": 1.339745962155613e-05,
206
- "loss": 1.7844,
207
  "step": 125
208
  },
209
  {
210
  "epoch": 2.6262626262626263,
211
- "grad_norm": 0.3034143797714973,
212
- "learning_rate": 8.073969641833445e-06,
213
- "loss": 1.7975,
214
  "step": 130
215
  },
216
  {
217
  "epoch": 2.7272727272727275,
218
- "grad_norm": 0.28778283893396456,
219
- "learning_rate": 4.050702638550275e-06,
220
- "loss": 1.7661,
221
  "step": 135
222
  },
223
  {
224
  "epoch": 2.8282828282828283,
225
- "grad_norm": 0.2907166952704868,
226
- "learning_rate": 1.3845646281813507e-06,
227
- "loss": 1.7619,
228
  "step": 140
229
  },
230
  {
231
  "epoch": 2.929292929292929,
232
- "grad_norm": 0.31772634861484544,
233
- "learning_rate": 1.1326608169920372e-07,
234
- "loss": 1.8282,
235
  "step": 145
236
  },
237
  {
238
- "epoch": 2.9696969696969697,
239
- "eval_loss": 1.7491472959518433,
240
- "eval_runtime": 161.8737,
241
- "eval_samples_per_second": 39.049,
242
- "eval_steps_per_second": 2.446,
243
- "step": 147
244
  },
245
  {
246
- "epoch": 2.9696969696969697,
247
- "step": 147,
248
- "total_flos": 706816481427456.0,
249
- "train_loss": 1.9096906704156578,
250
- "train_runtime": 1619.2688,
251
- "train_samples_per_second": 11.711,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  "train_steps_per_second": 0.091
253
  }
254
  ],
255
  "logging_steps": 5,
256
- "max_steps": 147,
257
  "num_input_tokens_seen": 0,
258
- "num_train_epochs": 3,
259
  "save_steps": 25,
260
  "stateful_callbacks": {
261
  "TrainerControl": {
@@ -269,7 +808,7 @@
269
  "attributes": {}
270
  }
271
  },
272
- "total_flos": 706816481427456.0,
273
  "train_batch_size": 8,
274
  "trial_name": null,
275
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.8989898989899,
5
  "eval_steps": 500,
6
+ "global_step": 490,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.020202020202020204,
13
+ "grad_norm": 1.1391378054420909,
14
+ "learning_rate": 4.081632653061224e-06,
15
+ "loss": 2.5995,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.10101010101010101,
20
+ "grad_norm": 1.006731604503432,
21
+ "learning_rate": 2.0408163265306123e-05,
22
+ "loss": 2.5925,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.20202020202020202,
27
+ "grad_norm": 1.3965084950072466,
28
+ "learning_rate": 4.0816326530612245e-05,
29
+ "loss": 2.546,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.30303030303030304,
34
+ "grad_norm": 0.59095362852847,
35
+ "learning_rate": 6.122448979591838e-05,
36
+ "loss": 2.396,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.40404040404040403,
41
+ "grad_norm": 0.33359551466461584,
42
+ "learning_rate": 8.163265306122449e-05,
43
+ "loss": 2.2744,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.5050505050505051,
48
+ "grad_norm": 0.3767673243983956,
49
+ "learning_rate": 0.00010204081632653062,
50
+ "loss": 2.1608,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.6060606060606061,
55
+ "grad_norm": 0.3530777092096336,
56
+ "learning_rate": 0.00012244897959183676,
57
+ "loss": 2.0261,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.7070707070707071,
62
+ "grad_norm": 0.36168305575388426,
63
+ "learning_rate": 0.00014285714285714287,
64
+ "loss": 2.0091,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.8080808080808081,
69
+ "grad_norm": 0.2764545304686734,
70
+ "learning_rate": 0.00016326530612244898,
71
+ "loss": 1.9434,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.9090909090909091,
76
+ "grad_norm": 0.2653033039849191,
77
+ "learning_rate": 0.00018367346938775512,
78
+ "loss": 1.8735,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.98989898989899,
83
+ "eval_loss": 1.8325327634811401,
84
+ "eval_runtime": 177.2293,
85
+ "eval_samples_per_second": 35.666,
86
+ "eval_steps_per_second": 2.234,
87
  "step": 49
88
  },
89
  {
90
  "epoch": 1.0101010101010102,
91
+ "grad_norm": 0.2170406034599688,
92
+ "learning_rate": 0.00019999746258949147,
93
+ "loss": 1.8679,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.1111111111111112,
98
+ "grad_norm": 0.2414932638214735,
99
+ "learning_rate": 0.00019990866674170983,
100
+ "loss": 1.8705,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.2121212121212122,
105
+ "grad_norm": 0.280526285390802,
106
+ "learning_rate": 0.00019969312910817183,
107
+ "loss": 1.8428,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.3131313131313131,
112
+ "grad_norm": 0.5716516166032067,
113
+ "learning_rate": 0.000199351123114852,
114
+ "loss": 1.8267,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.4141414141414141,
119
+ "grad_norm": 0.22548657696350605,
120
+ "learning_rate": 0.00019888308262251285,
121
+ "loss": 1.7959,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.5151515151515151,
126
+ "grad_norm": 0.2400943754848952,
127
+ "learning_rate": 0.00019828960137631928,
128
+ "loss": 1.8328,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.6161616161616161,
133
+ "grad_norm": 0.24338845667140263,
134
+ "learning_rate": 0.00019757143225262728,
135
+ "loss": 1.8287,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.7171717171717171,
140
+ "grad_norm": 0.23391106368008455,
141
+ "learning_rate": 0.00019672948630390294,
142
+ "loss": 1.8345,
143
  "step": 85
144
  },
145
  {
146
  "epoch": 1.8181818181818183,
147
+ "grad_norm": 0.27223367547235244,
148
+ "learning_rate": 0.00019576483160298246,
149
+ "loss": 1.7731,
150
  "step": 90
151
  },
152
  {
153
  "epoch": 1.9191919191919191,
154
+ "grad_norm": 0.25522514087403636,
155
+ "learning_rate": 0.00019467869188814023,
156
+ "loss": 1.8231,
157
  "step": 95
158
  },
159
  {
160
  "epoch": 2.0,
161
+ "eval_loss": 1.7238675355911255,
162
+ "eval_runtime": 175.3209,
163
+ "eval_samples_per_second": 36.054,
164
+ "eval_steps_per_second": 2.259,
165
  "step": 99
166
  },
167
  {
168
  "epoch": 2.0202020202020203,
169
+ "grad_norm": 0.24156163537353126,
170
+ "learning_rate": 0.00019347244501068312,
171
+ "loss": 1.8199,
172
  "step": 100
173
  },
174
  {
175
  "epoch": 2.121212121212121,
176
+ "grad_norm": 0.26738911600323706,
177
+ "learning_rate": 0.00019214762118704076,
178
+ "loss": 1.7554,
179
  "step": 105
180
  },
181
  {
182
  "epoch": 2.2222222222222223,
183
+ "grad_norm": 0.2974142479913874,
184
+ "learning_rate": 0.000190705901057569,
185
+ "loss": 1.7693,
186
  "step": 110
187
  },
188
  {
189
  "epoch": 2.323232323232323,
190
+ "grad_norm": 0.32489025521491743,
191
+ "learning_rate": 0.00018914911355452895,
192
+ "loss": 1.7036,
193
  "step": 115
194
  },
195
  {
196
  "epoch": 2.4242424242424243,
197
+ "grad_norm": 0.34564309759210704,
198
+ "learning_rate": 0.00018747923358194662,
199
+ "loss": 1.7449,
200
  "step": 120
201
  },
202
  {
203
  "epoch": 2.525252525252525,
204
+ "grad_norm": 0.33251849318291843,
205
+ "learning_rate": 0.00018569837951029595,
206
+ "loss": 1.7556,
207
  "step": 125
208
  },
209
  {
210
  "epoch": 2.6262626262626263,
211
+ "grad_norm": 0.33565250510381917,
212
+ "learning_rate": 0.00018380881048918405,
213
+ "loss": 1.744,
214
  "step": 130
215
  },
216
  {
217
  "epoch": 2.7272727272727275,
218
+ "grad_norm": 0.3427001935365706,
219
+ "learning_rate": 0.00018181292358144703,
220
+ "loss": 1.7234,
221
  "step": 135
222
  },
223
  {
224
  "epoch": 2.8282828282828283,
225
+ "grad_norm": 0.32846296076937864,
226
+ "learning_rate": 0.00017971325072229226,
227
+ "loss": 1.7274,
228
  "step": 140
229
  },
230
  {
231
  "epoch": 2.929292929292929,
232
+ "grad_norm": 0.3446715050022125,
233
+ "learning_rate": 0.0001775124555073452,
234
+ "loss": 1.7516,
235
  "step": 145
236
  },
237
  {
238
+ "epoch": 2.98989898989899,
239
+ "eval_loss": 1.6329888105392456,
240
+ "eval_runtime": 174.9738,
241
+ "eval_samples_per_second": 36.125,
242
+ "eval_steps_per_second": 2.263,
243
+ "step": 148
244
  },
245
  {
246
+ "epoch": 3.0303030303030303,
247
+ "grad_norm": 0.34753378836184123,
248
+ "learning_rate": 0.0001752133298136744,
249
+ "loss": 1.7442,
250
+ "step": 150
251
+ },
252
+ {
253
+ "epoch": 3.1313131313131315,
254
+ "grad_norm": 0.3899145091665638,
255
+ "learning_rate": 0.0001728187902580819,
256
+ "loss": 1.6414,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 3.2323232323232323,
261
+ "grad_norm": 0.3969944429695798,
262
+ "learning_rate": 0.00017033187449715196,
263
+ "loss": 1.6411,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 3.3333333333333335,
268
+ "grad_norm": 0.4463802224093316,
269
+ "learning_rate": 0.00016775573737375096,
270
+ "loss": 1.6955,
271
+ "step": 165
272
+ },
273
+ {
274
+ "epoch": 3.4343434343434343,
275
+ "grad_norm": 0.4873799041554826,
276
+ "learning_rate": 0.0001650936469148681,
277
+ "loss": 1.6812,
278
+ "step": 170
279
+ },
280
+ {
281
+ "epoch": 3.5353535353535355,
282
+ "grad_norm": 0.5138644486787001,
283
+ "learning_rate": 0.00016234898018587337,
284
+ "loss": 1.6455,
285
+ "step": 175
286
+ },
287
+ {
288
+ "epoch": 3.6363636363636362,
289
+ "grad_norm": 0.4441989179255284,
290
+ "learning_rate": 0.00015952521900645144,
291
+ "loss": 1.6537,
292
+ "step": 180
293
+ },
294
+ {
295
+ "epoch": 3.7373737373737375,
296
+ "grad_norm": 0.45397642246696135,
297
+ "learning_rate": 0.0001566259455336474,
298
+ "loss": 1.6384,
299
+ "step": 185
300
+ },
301
+ {
302
+ "epoch": 3.8383838383838382,
303
+ "grad_norm": 0.48522658034874977,
304
+ "learning_rate": 0.0001536548377176263,
305
+ "loss": 1.6292,
306
+ "step": 190
307
+ },
308
+ {
309
+ "epoch": 3.9393939393939394,
310
+ "grad_norm": 0.43244857762556244,
311
+ "learning_rate": 0.0001506156646359123,
312
+ "loss": 1.6586,
313
+ "step": 195
314
+ },
315
+ {
316
+ "epoch": 4.0,
317
+ "eval_loss": 1.5279655456542969,
318
+ "eval_runtime": 175.1053,
319
+ "eval_samples_per_second": 36.098,
320
+ "eval_steps_per_second": 2.261,
321
+ "step": 198
322
+ },
323
+ {
324
+ "epoch": 4.040404040404041,
325
+ "grad_norm": 0.48231588190167396,
326
+ "learning_rate": 0.0001475122817120253,
327
+ "loss": 1.6137,
328
+ "step": 200
329
+ },
330
+ {
331
+ "epoch": 4.141414141414141,
332
+ "grad_norm": 0.5842989060014684,
333
+ "learning_rate": 0.00014434862582458135,
334
+ "loss": 1.5082,
335
+ "step": 205
336
+ },
337
+ {
338
+ "epoch": 4.242424242424242,
339
+ "grad_norm": 0.5870996767325578,
340
+ "learning_rate": 0.00014112871031306119,
341
+ "loss": 1.5382,
342
+ "step": 210
343
+ },
344
+ {
345
+ "epoch": 4.343434343434343,
346
+ "grad_norm": 0.6294490520638103,
347
+ "learning_rate": 0.0001378566198865818,
348
+ "loss": 1.5738,
349
+ "step": 215
350
+ },
351
+ {
352
+ "epoch": 4.444444444444445,
353
+ "grad_norm": 0.6361554344671604,
354
+ "learning_rate": 0.00013453650544213076,
355
+ "loss": 1.5609,
356
+ "step": 220
357
+ },
358
+ {
359
+ "epoch": 4.545454545454545,
360
+ "grad_norm": 0.5845910737228225,
361
+ "learning_rate": 0.00013117257879883583,
362
+ "loss": 1.5832,
363
+ "step": 225
364
+ },
365
+ {
366
+ "epoch": 4.646464646464646,
367
+ "grad_norm": 0.6362570401491278,
368
+ "learning_rate": 0.00012776910735495003,
369
+ "loss": 1.5386,
370
+ "step": 230
371
+ },
372
+ {
373
+ "epoch": 4.747474747474747,
374
+ "grad_norm": 0.6079381787055775,
375
+ "learning_rate": 0.0001243304086743309,
376
+ "loss": 1.5408,
377
+ "step": 235
378
+ },
379
+ {
380
+ "epoch": 4.848484848484849,
381
+ "grad_norm": 0.5955494164961348,
382
+ "learning_rate": 0.0001208608450092801,
383
+ "loss": 1.5767,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 4.94949494949495,
388
+ "grad_norm": 0.5941973746172844,
389
+ "learning_rate": 0.00011736481776669306,
390
+ "loss": 1.5571,
391
+ "step": 245
392
+ },
393
+ {
394
+ "epoch": 4.98989898989899,
395
+ "eval_loss": 1.4166467189788818,
396
+ "eval_runtime": 174.6193,
397
+ "eval_samples_per_second": 36.199,
398
+ "eval_steps_per_second": 2.268,
399
+ "step": 247
400
+ },
401
+ {
402
+ "epoch": 5.05050505050505,
403
+ "grad_norm": 0.6955112160268645,
404
+ "learning_rate": 0.0001138467619245374,
405
+ "loss": 1.5011,
406
+ "step": 250
407
+ },
408
+ {
409
+ "epoch": 5.151515151515151,
410
+ "grad_norm": 0.7116916227562953,
411
+ "learning_rate": 0.00011031114040574437,
412
+ "loss": 1.4537,
413
+ "step": 255
414
+ },
415
+ {
416
+ "epoch": 5.252525252525253,
417
+ "grad_norm": 0.8295579161716972,
418
+ "learning_rate": 0.0001067624384166495,
419
+ "loss": 1.398,
420
+ "step": 260
421
+ },
422
+ {
423
+ "epoch": 5.353535353535354,
424
+ "grad_norm": 0.7415551092257379,
425
+ "learning_rate": 0.00010320515775716555,
426
+ "loss": 1.4474,
427
+ "step": 265
428
+ },
429
+ {
430
+ "epoch": 5.454545454545454,
431
+ "grad_norm": 0.7957507416152227,
432
+ "learning_rate": 9.96438111099047e-05,
433
+ "loss": 1.4459,
434
+ "step": 270
435
+ },
436
+ {
437
+ "epoch": 5.555555555555555,
438
+ "grad_norm": 0.8098108632452509,
439
+ "learning_rate": 9.608291631549574e-05,
440
+ "loss": 1.4266,
441
+ "step": 275
442
+ },
443
+ {
444
+ "epoch": 5.656565656565657,
445
+ "grad_norm": 0.8498743613190896,
446
+ "learning_rate": 9.252699064135758e-05,
447
+ "loss": 1.3931,
448
+ "step": 280
449
+ },
450
+ {
451
+ "epoch": 5.757575757575758,
452
+ "grad_norm": 0.7968761297367668,
453
+ "learning_rate": 8.898054505119989e-05,
454
+ "loss": 1.4628,
455
+ "step": 285
456
+ },
457
+ {
458
+ "epoch": 5.858585858585858,
459
+ "grad_norm": 0.8166566096084199,
460
+ "learning_rate": 8.54480784825207e-05,
461
+ "loss": 1.4777,
462
+ "step": 290
463
+ },
464
+ {
465
+ "epoch": 5.959595959595959,
466
+ "grad_norm": 0.7564583944169918,
467
+ "learning_rate": 8.193407213936012e-05,
468
+ "loss": 1.4677,
469
+ "step": 295
470
+ },
471
+ {
472
+ "epoch": 6.0,
473
+ "eval_loss": 1.3067700862884521,
474
+ "eval_runtime": 175.1357,
475
+ "eval_samples_per_second": 36.092,
476
+ "eval_steps_per_second": 2.261,
477
+ "step": 297
478
+ },
479
+ {
480
+ "epoch": 6.0606060606060606,
481
+ "grad_norm": 0.8369827825158762,
482
+ "learning_rate": 7.844298380755003e-05,
483
+ "loss": 1.375,
484
+ "step": 300
485
+ },
486
+ {
487
+ "epoch": 6.161616161616162,
488
+ "grad_norm": 0.9204188282340791,
489
+ "learning_rate": 7.497924219967209e-05,
490
+ "loss": 1.2999,
491
+ "step": 305
492
+ },
493
+ {
494
+ "epoch": 6.262626262626263,
495
+ "grad_norm": 0.9559880600184892,
496
+ "learning_rate": 7.154724133689677e-05,
497
+ "loss": 1.3084,
498
+ "step": 310
499
+ },
500
+ {
501
+ "epoch": 6.363636363636363,
502
+ "grad_norm": 0.9272296702059781,
503
+ "learning_rate": 6.815133497483157e-05,
504
+ "loss": 1.3405,
505
+ "step": 315
506
+ },
507
+ {
508
+ "epoch": 6.4646464646464645,
509
+ "grad_norm": 1.0203421193696094,
510
+ "learning_rate": 6.479583108044899e-05,
511
+ "loss": 1.3165,
512
+ "step": 320
513
+ },
514
+ {
515
+ "epoch": 6.565656565656566,
516
+ "grad_norm": 0.8932381508297077,
517
+ "learning_rate": 6.148498636710092e-05,
518
+ "loss": 1.3641,
519
+ "step": 325
520
+ },
521
+ {
522
+ "epoch": 6.666666666666667,
523
+ "grad_norm": 0.9527012454845684,
524
+ "learning_rate": 5.822300089455211e-05,
525
+ "loss": 1.3179,
526
+ "step": 330
527
+ },
528
+ {
529
+ "epoch": 6.767676767676767,
530
+ "grad_norm": 0.9644270167383292,
531
+ "learning_rate": 5.5014012740883115e-05,
532
+ "loss": 1.3295,
533
+ "step": 335
534
+ },
535
+ {
536
+ "epoch": 6.8686868686868685,
537
+ "grad_norm": 0.9489303492473159,
538
+ "learning_rate": 5.1862092753021754e-05,
539
+ "loss": 1.3482,
540
+ "step": 340
541
+ },
542
+ {
543
+ "epoch": 6.96969696969697,
544
+ "grad_norm": 0.9417366559193787,
545
+ "learning_rate": 4.8771239382562287e-05,
546
+ "loss": 1.3422,
547
+ "step": 345
548
+ },
549
+ {
550
+ "epoch": 6.98989898989899,
551
+ "eval_loss": 1.2082042694091797,
552
+ "eval_runtime": 174.4886,
553
+ "eval_samples_per_second": 36.226,
554
+ "eval_steps_per_second": 2.269,
555
+ "step": 346
556
+ },
557
+ {
558
+ "epoch": 7.070707070707071,
559
+ "grad_norm": 1.0189885797060951,
560
+ "learning_rate": 4.574537361342407e-05,
561
+ "loss": 1.2447,
562
+ "step": 350
563
+ },
564
+ {
565
+ "epoch": 7.171717171717171,
566
+ "grad_norm": 1.0456878645941505,
567
+ "learning_rate": 4.278833398778306e-05,
568
+ "loss": 1.2438,
569
+ "step": 355
570
+ },
571
+ {
572
+ "epoch": 7.2727272727272725,
573
+ "grad_norm": 1.0906515200546398,
574
+ "learning_rate": 3.990387173658774e-05,
575
+ "loss": 1.2135,
576
+ "step": 360
577
+ },
578
+ {
579
+ "epoch": 7.373737373737374,
580
+ "grad_norm": 1.1138045736907602,
581
+ "learning_rate": 3.7095646020835754e-05,
582
+ "loss": 1.2152,
583
+ "step": 365
584
+ },
585
+ {
586
+ "epoch": 7.474747474747475,
587
+ "grad_norm": 1.1333935442018617,
588
+ "learning_rate": 3.436721928964819e-05,
589
+ "loss": 1.2004,
590
+ "step": 370
591
+ },
592
+ {
593
+ "epoch": 7.575757575757576,
594
+ "grad_norm": 1.0135992096066218,
595
+ "learning_rate": 3.172205276103033e-05,
596
+ "loss": 1.1904,
597
+ "step": 375
598
+ },
599
+ {
600
+ "epoch": 7.6767676767676765,
601
+ "grad_norm": 1.0811091792166911,
602
+ "learning_rate": 2.916350203105207e-05,
603
+ "loss": 1.2475,
604
+ "step": 380
605
+ },
606
+ {
607
+ "epoch": 7.777777777777778,
608
+ "grad_norm": 1.1722915377984628,
609
+ "learning_rate": 2.669481281701739e-05,
610
+ "loss": 1.2273,
611
+ "step": 385
612
+ },
613
+ {
614
+ "epoch": 7.878787878787879,
615
+ "grad_norm": 1.0151820296117031,
616
+ "learning_rate": 2.4319116840023813e-05,
617
+ "loss": 1.2462,
618
+ "step": 390
619
+ },
620
+ {
621
+ "epoch": 7.97979797979798,
622
+ "grad_norm": 1.0359433658999384,
623
+ "learning_rate": 2.2039427852134788e-05,
624
+ "loss": 1.2609,
625
+ "step": 395
626
+ },
627
+ {
628
+ "epoch": 8.0,
629
+ "eval_loss": 1.137781023979187,
630
+ "eval_runtime": 163.6859,
631
+ "eval_samples_per_second": 38.617,
632
+ "eval_steps_per_second": 2.419,
633
+ "step": 396
634
+ },
635
+ {
636
+ "epoch": 8.080808080808081,
637
+ "grad_norm": 1.0914414519615843,
638
+ "learning_rate": 1.985863781320435e-05,
639
+ "loss": 1.1457,
640
+ "step": 400
641
+ },
642
+ {
643
+ "epoch": 8.181818181818182,
644
+ "grad_norm": 1.2849174669504693,
645
+ "learning_rate": 1.777951322220508e-05,
646
+ "loss": 1.1925,
647
+ "step": 405
648
+ },
649
+ {
650
+ "epoch": 8.282828282828282,
651
+ "grad_norm": 1.0562037397284274,
652
+ "learning_rate": 1.580469160771253e-05,
653
+ "loss": 1.1653,
654
+ "step": 410
655
+ },
656
+ {
657
+ "epoch": 8.383838383838384,
658
+ "grad_norm": 1.1942325172166053,
659
+ "learning_rate": 1.3936678181998374e-05,
660
+ "loss": 1.1451,
661
+ "step": 415
662
+ },
663
+ {
664
+ "epoch": 8.484848484848484,
665
+ "grad_norm": 1.2292184186394104,
666
+ "learning_rate": 1.2177842662977135e-05,
667
+ "loss": 1.1432,
668
+ "step": 420
669
+ },
670
+ {
671
+ "epoch": 8.585858585858587,
672
+ "grad_norm": 1.1449254109310076,
673
+ "learning_rate": 1.0530416268037702e-05,
674
+ "loss": 1.1459,
675
+ "step": 425
676
+ },
677
+ {
678
+ "epoch": 8.686868686868687,
679
+ "grad_norm": 1.1159137674762092,
680
+ "learning_rate": 8.99648888357335e-06,
681
+ "loss": 1.1889,
682
+ "step": 430
683
+ },
684
+ {
685
+ "epoch": 8.787878787878787,
686
+ "grad_norm": 1.1893818134430183,
687
+ "learning_rate": 7.578006413801075e-06,
688
+ "loss": 1.1809,
689
+ "step": 435
690
+ },
691
+ {
692
+ "epoch": 8.88888888888889,
693
+ "grad_norm": 1.131890459862098,
694
+ "learning_rate": 6.276768312233228e-06,
695
+ "loss": 1.1806,
696
+ "step": 440
697
+ },
698
+ {
699
+ "epoch": 8.98989898989899,
700
+ "grad_norm": 1.0926795618787801,
701
+ "learning_rate": 5.094425298933136e-06,
702
+ "loss": 1.1647,
703
+ "step": 445
704
+ },
705
+ {
706
+ "epoch": 8.98989898989899,
707
+ "eval_loss": 1.107386827468872,
708
+ "eval_runtime": 163.223,
709
+ "eval_samples_per_second": 38.726,
710
+ "eval_steps_per_second": 2.426,
711
+ "step": 445
712
+ },
713
+ {
714
+ "epoch": 9.090909090909092,
715
+ "grad_norm": 1.1051037332814697,
716
+ "learning_rate": 4.0324772664503296e-06,
717
+ "loss": 1.1438,
718
+ "step": 450
719
+ },
720
+ {
721
+ "epoch": 9.191919191919192,
722
+ "grad_norm": 1.164376038164282,
723
+ "learning_rate": 3.092271377092215e-06,
724
+ "loss": 1.1481,
725
+ "step": 455
726
+ },
727
+ {
728
+ "epoch": 9.292929292929292,
729
+ "grad_norm": 1.2303081966513765,
730
+ "learning_rate": 2.2750003539455998e-06,
731
+ "loss": 1.1202,
732
+ "step": 460
733
+ },
734
+ {
735
+ "epoch": 9.393939393939394,
736
+ "grad_norm": 1.2010643624794166,
737
+ "learning_rate": 1.5817009678162685e-06,
738
+ "loss": 1.142,
739
+ "step": 465
740
+ },
741
+ {
742
+ "epoch": 9.494949494949495,
743
+ "grad_norm": 1.2423558300638782,
744
+ "learning_rate": 1.013252722005842e-06,
745
+ "loss": 1.1842,
746
+ "step": 470
747
+ },
748
+ {
749
+ "epoch": 9.595959595959595,
750
+ "grad_norm": 1.1980002179676799,
751
+ "learning_rate": 5.703767365946466e-07,
752
+ "loss": 1.1236,
753
+ "step": 475
754
+ },
755
+ {
756
+ "epoch": 9.696969696969697,
757
+ "grad_norm": 1.2034300162155251,
758
+ "learning_rate": 2.536348336456551e-07,
759
+ "loss": 1.1168,
760
+ "step": 480
761
+ },
762
+ {
763
+ "epoch": 9.797979797979798,
764
+ "grad_norm": 1.2022297984086214,
765
+ "learning_rate": 6.342882449029696e-08,
766
+ "loss": 1.1133,
767
+ "step": 485
768
+ },
769
+ {
770
+ "epoch": 9.8989898989899,
771
+ "grad_norm": 1.0699752057421905,
772
+ "learning_rate": 0.0,
773
+ "loss": 1.1571,
774
+ "step": 490
775
+ },
776
+ {
777
+ "epoch": 9.8989898989899,
778
+ "eval_loss": 1.1029597520828247,
779
+ "eval_runtime": 163.4283,
780
+ "eval_samples_per_second": 38.678,
781
+ "eval_steps_per_second": 2.423,
782
+ "step": 490
783
+ },
784
+ {
785
+ "epoch": 9.8989898989899,
786
+ "step": 490,
787
+ "total_flos": 2344635780825088.0,
788
+ "train_loss": 1.5290005391957808,
789
+ "train_runtime": 5413.3076,
790
+ "train_samples_per_second": 11.677,
791
  "train_steps_per_second": 0.091
792
  }
793
  ],
794
  "logging_steps": 5,
795
+ "max_steps": 490,
796
  "num_input_tokens_seen": 0,
797
+ "num_train_epochs": 10,
798
  "save_steps": 25,
799
  "stateful_callbacks": {
800
  "TrainerControl": {
 
808
  "attributes": {}
809
  }
810
  },
811
+ "total_flos": 2344635780825088.0,
812
  "train_batch_size": 8,
813
  "trial_name": null,
814
  "trial_params": null