File size: 5,698 Bytes
171d73c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
[
    {
        "step": 30,
        "loss": 1.4062,
        "type": "train",
        "grad_norm": 8.482284545898438,
        "learning_rate": 1.8347107438016528e-06,
        "epoch": 0.2475502836513667
    },
    {
        "step": 30,
        "loss": 1.1563361883163452,
        "type": "eval",
        "eval_runtime": 238.6436,
        "eval_samples_per_second": 2.891,
        "eval_steps_per_second": 2.891,
        "epoch": 0.2475502836513667
    },
    {
        "step": 60,
        "loss": 1.1662,
        "type": "train",
        "grad_norm": 6.751504898071289,
        "learning_rate": 1.6694214876033058e-06,
        "epoch": 0.4951005673027334
    },
    {
        "step": 60,
        "loss": 1.0914983749389648,
        "type": "eval",
        "eval_runtime": 248.6513,
        "eval_samples_per_second": 2.775,
        "eval_steps_per_second": 2.775,
        "epoch": 0.4951005673027334
    },
    {
        "step": 90,
        "loss": 1.1109,
        "type": "train",
        "grad_norm": 4.135236740112305,
        "learning_rate": 1.5041322314049587e-06,
        "epoch": 0.7426508509541001
    },
    {
        "step": 90,
        "loss": 1.0496803522109985,
        "type": "eval",
        "eval_runtime": 239.5109,
        "eval_samples_per_second": 2.881,
        "eval_steps_per_second": 2.881,
        "epoch": 0.7426508509541001
    },
    {
        "step": 120,
        "loss": 1.0639,
        "type": "train",
        "grad_norm": 5.695450782775879,
        "learning_rate": 1.3388429752066116e-06,
        "epoch": 0.9902011346054668
    },
    {
        "step": 120,
        "loss": 1.0101323127746582,
        "type": "eval",
        "eval_runtime": 234.2087,
        "eval_samples_per_second": 2.946,
        "eval_steps_per_second": 2.946,
        "epoch": 0.9902011346054668
    },
    {
        "step": 150,
        "loss": 0.9416,
        "type": "train",
        "grad_norm": 7.571918487548828,
        "learning_rate": 1.1735537190082645e-06,
        "epoch": 1.2377514182568334
    },
    {
        "step": 150,
        "loss": 1.0040597915649414,
        "type": "eval",
        "eval_runtime": 247.1746,
        "eval_samples_per_second": 2.792,
        "eval_steps_per_second": 2.792,
        "epoch": 1.2377514182568334
    },
    {
        "step": 180,
        "loss": 0.925,
        "type": "train",
        "grad_norm": 4.53548002243042,
        "learning_rate": 1.0082644628099172e-06,
        "epoch": 1.4853017019082002
    },
    {
        "step": 180,
        "loss": 0.9938931465148926,
        "type": "eval",
        "eval_runtime": 234.8252,
        "eval_samples_per_second": 2.938,
        "eval_steps_per_second": 2.938,
        "epoch": 1.4853017019082002
    },
    {
        "step": 210,
        "loss": 0.9033,
        "type": "train",
        "grad_norm": 5.361794948577881,
        "learning_rate": 8.429752066115701e-07,
        "epoch": 1.7328519855595668
    },
    {
        "step": 210,
        "loss": 0.9868502020835876,
        "type": "eval",
        "eval_runtime": 234.8772,
        "eval_samples_per_second": 2.938,
        "eval_steps_per_second": 2.938,
        "epoch": 1.7328519855595668
    },
    {
        "step": 240,
        "loss": 0.9087,
        "type": "train",
        "grad_norm": 4.525313377380371,
        "learning_rate": 6.776859504132231e-07,
        "epoch": 1.9804022692109333
    },
    {
        "step": 240,
        "loss": 0.9824326634407043,
        "type": "eval",
        "eval_runtime": 234.8918,
        "eval_samples_per_second": 2.938,
        "eval_steps_per_second": 2.938,
        "epoch": 1.9804022692109333
    },
    {
        "step": 270,
        "loss": 0.8166,
        "type": "train",
        "grad_norm": 4.654973030090332,
        "learning_rate": 5.12396694214876e-07,
        "epoch": 2.2279525528623
    },
    {
        "step": 270,
        "loss": 0.9874295592308044,
        "type": "eval",
        "eval_runtime": 243.7953,
        "eval_samples_per_second": 2.83,
        "eval_steps_per_second": 2.83,
        "epoch": 2.2279525528623
    },
    {
        "step": 300,
        "loss": 0.8226,
        "type": "train",
        "grad_norm": 5.9346442222595215,
        "learning_rate": 3.471074380165289e-07,
        "epoch": 2.475502836513667
    },
    {
        "step": 300,
        "loss": 0.9854046106338501,
        "type": "eval",
        "eval_runtime": 243.3847,
        "eval_samples_per_second": 2.835,
        "eval_steps_per_second": 2.835,
        "epoch": 2.475502836513667
    },
    {
        "step": 330,
        "loss": 0.8289,
        "type": "train",
        "grad_norm": 4.637845516204834,
        "learning_rate": 1.818181818181818e-07,
        "epoch": 2.7230531201650336
    },
    {
        "step": 330,
        "loss": 0.9841367602348328,
        "type": "eval",
        "eval_runtime": 242.6797,
        "eval_samples_per_second": 2.843,
        "eval_steps_per_second": 2.843,
        "epoch": 2.7230531201650336
    },
    {
        "step": 360,
        "loss": 0.8137,
        "type": "train",
        "grad_norm": 4.132049560546875,
        "learning_rate": 1.652892561983471e-08,
        "epoch": 2.9706034038164004
    },
    {
        "step": 360,
        "loss": 0.9834251403808594,
        "type": "eval",
        "eval_runtime": 242.7445,
        "eval_samples_per_second": 2.842,
        "eval_steps_per_second": 2.842,
        "epoch": 2.9706034038164004
    },
    {
        "step": 363,
        "train_runtime": 22924.4691,
        "train_samples_per_second": 0.507,
        "train_steps_per_second": 0.016,
        "total_flos": 7877706776576.0,
        "train_loss": 0.9745982139892158,
        "epoch": 2.9953584321815367
    }
]