SouthMemphis committed on
Commit
0da872b
·
verified ·
1 Parent(s): cee3fb2

SouthMemphis/ViT_military_aircraft

Browse files
README.md ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: google/vit-base-patch16-224-in21k
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - imagefolder
9
+ metrics:
10
+ - accuracy
11
+ model-index:
12
+ - name: vit-base-beans
13
+ results:
14
+ - task:
15
+ name: Image Classification
16
+ type: image-classification
17
+ dataset:
18
+ name: imagefolder
19
+ type: imagefolder
20
+ config: default
21
+ split: train
22
+ args: default
23
+ metrics:
24
+ - name: Accuracy
25
+ type: accuracy
26
+ value: 0.90272614622057
27
+ ---
28
+
29
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
30
+ should probably proofread and complete it, then remove this comment. -->
31
+
32
+ # vit-base-beans
33
+
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset, trained to classify images into 74 military aircraft types (see `id2label` in `config.json`).
35
+ It achieves the following results on the evaluation set:
36
+ - Loss: 0.3643
37
+ - Accuracy: 0.9027
38
+
39
+ ## Model description
40
+
41
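+ A Vision Transformer (ViT) image classifier with 16×16 patches and 224×224 input, fine-tuned for single-label classification over 74 military aircraft classes (e.g. A10, F16, Su57); the full label list is in `config.json`. More information needed.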
42
+
43
+ ## Intended uses & limitations
44
+
45
+ More information needed
46
+
47
+ ## Training and evaluation data
48
+
49
+ More information needed
50
+
51
+ ## Training procedure
52
+
53
+ ### Training hyperparameters
54
+
55
+ The following hyperparameters were used during training:
56
+ - learning_rate: 0.0002
57
+ - train_batch_size: 16
58
+ - eval_batch_size: 8
59
+ - seed: 42
60
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
61
+ - lr_scheduler_type: linear
62
+ - num_epochs: 2
63
+ - mixed_precision_training: Native AMP
64
+
65
+ ### Training results
66
+
67
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
68
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
69
+ | 3.5924 | 0.0620 | 100 | 3.5675 | 0.1927 |
70
+ | 3.0189 | 0.1239 | 200 | 3.0313 | 0.3047 |
71
+ | 2.5541 | 0.1859 | 300 | 2.5575 | 0.3956 |
72
+ | 2.114 | 0.2478 | 400 | 2.2332 | 0.4571 |
73
+ | 1.9624 | 0.3098 | 500 | 1.9455 | 0.5596 |
74
+ | 1.6749 | 0.3717 | 600 | 1.7370 | 0.5787 |
75
+ | 1.5852 | 0.4337 | 700 | 1.4947 | 0.6439 |
76
+ | 1.1875 | 0.4957 | 800 | 1.4151 | 0.6468 |
77
+ | 1.5114 | 0.5576 | 900 | 1.2709 | 0.6820 |
78
+ | 1.3122 | 0.6196 | 1000 | 1.1940 | 0.6939 |
79
+ | 1.0721 | 0.6815 | 1100 | 1.0757 | 0.7261 |
80
+ | 0.8249 | 0.7435 | 1200 | 0.9666 | 0.7576 |
81
+ | 0.7944 | 0.8055 | 1300 | 0.9101 | 0.7708 |
82
+ | 0.8032 | 0.8674 | 1400 | 0.9011 | 0.7691 |
83
+ | 0.7479 | 0.9294 | 1500 | 0.7409 | 0.8067 |
84
+ | 0.5997 | 0.9913 | 1600 | 0.7326 | 0.8110 |
85
+ | 0.5005 | 1.0533 | 1700 | 0.6769 | 0.8211 |
86
+ | 0.4107 | 1.1152 | 1800 | 0.6375 | 0.8374 |
87
+ | 0.4596 | 1.1772 | 1900 | 0.6302 | 0.8304 |
88
+ | 0.2544 | 1.2392 | 2000 | 0.5805 | 0.8400 |
89
+ | 0.2983 | 1.3011 | 2100 | 0.5480 | 0.8501 |
90
+ | 0.3214 | 1.3631 | 2200 | 0.5053 | 0.8683 |
91
+ | 0.2384 | 1.4250 | 2300 | 0.4929 | 0.8713 |
92
+ | 0.2397 | 1.4870 | 2400 | 0.4664 | 0.8742 |
93
+ | 0.3448 | 1.5489 | 2500 | 0.4690 | 0.8755 |
94
+ | 0.3129 | 1.6109 | 2600 | 0.4351 | 0.8843 |
95
+ | 0.1027 | 1.6729 | 2700 | 0.4311 | 0.8846 |
96
+ | 0.2086 | 1.7348 | 2800 | 0.4088 | 0.8897 |
97
+ | 0.1683 | 1.7968 | 2900 | 0.4133 | 0.8919 |
98
+ | 0.2767 | 1.8587 | 3000 | 0.3851 | 0.8964 |
99
+ | 0.1582 | 1.9207 | 3100 | 0.3703 | 0.9018 |
100
+ | 0.1421 | 1.9827 | 3200 | 0.3643 | 0.9027 |
101
+
102
+
103
+ ### Framework versions
104
+
105
+ - Transformers 4.44.2
106
+ - Pytorch 2.4.1+cu121
107
+ - Datasets 3.2.0
108
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.90272614622057,
4
+ "eval_loss": 0.36434125900268555,
5
+ "eval_runtime": 107.3324,
6
+ "eval_samples_per_second": 60.15,
7
+ "eval_steps_per_second": 7.519,
8
+ "total_flos": 4.004423768814723e+18,
9
+ "train_loss": 0.9726454161726114,
10
+ "train_runtime": 5145.5507,
11
+ "train_samples_per_second": 10.036,
12
+ "train_steps_per_second": 0.627
13
+ }
config.json ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "A10",
13
+ "1": "A400M",
14
+ "2": "AG600",
15
+ "3": "AH64",
16
+ "4": "AV8B",
17
+ "5": "An124",
18
+ "6": "An22",
19
+ "7": "An225",
20
+ "8": "An72",
21
+ "9": "B1",
22
+ "10": "B2",
23
+ "11": "B21",
24
+ "12": "B52",
25
+ "13": "Be200",
26
+ "14": "C130",
27
+ "15": "C17",
28
+ "16": "C2",
29
+ "17": "C390",
30
+ "18": "C5",
31
+ "19": "CH47",
32
+ "20": "CL415",
33
+ "21": "E2",
34
+ "22": "E7",
35
+ "23": "EF2000",
36
+ "24": "F117",
37
+ "25": "F14",
38
+ "26": "F15",
39
+ "27": "F16",
40
+ "28": "F18",
41
+ "29": "F22",
42
+ "30": "F35",
43
+ "31": "F4",
44
+ "32": "H6",
45
+ "33": "J10",
46
+ "34": "J20",
47
+ "35": "JAS39",
48
+ "36": "JF17",
49
+ "37": "JH7",
50
+ "38": "KC135",
51
+ "39": "KF21",
52
+ "40": "KJ600",
53
+ "41": "Ka27",
54
+ "42": "Ka52",
55
+ "43": "MQ9",
56
+ "44": "Mi24",
57
+ "45": "Mi26",
58
+ "46": "Mi28",
59
+ "47": "Mig29",
60
+ "48": "Mig31",
61
+ "49": "Mirage2000",
62
+ "50": "P3",
63
+ "51": "RQ4",
64
+ "52": "Rafale",
65
+ "53": "SR71",
66
+ "54": "Su24",
67
+ "55": "Su25",
68
+ "56": "Su34",
69
+ "57": "Su57",
70
+ "58": "TB001",
71
+ "59": "TB2",
72
+ "60": "Tornado",
73
+ "61": "Tu160",
74
+ "62": "Tu22M",
75
+ "63": "Tu95",
76
+ "64": "U2",
77
+ "65": "UH60",
78
+ "66": "US2",
79
+ "67": "V22",
80
+ "68": "Vulcan",
81
+ "69": "WZ7",
82
+ "70": "XB70",
83
+ "71": "Y20",
84
+ "72": "YF23",
85
+ "73": "Z19"
86
+ },
87
+ "image_size": 224,
88
+ "initializer_range": 0.02,
89
+ "intermediate_size": 3072,
90
+ "label2id": {
91
+ "A10": 0,
92
+ "A400M": 1,
93
+ "AG600": 2,
94
+ "AH64": 3,
95
+ "AV8B": 4,
96
+ "An124": 5,
97
+ "An22": 6,
98
+ "An225": 7,
99
+ "An72": 8,
100
+ "B1": 9,
101
+ "B2": 10,
102
+ "B21": 11,
103
+ "B52": 12,
104
+ "Be200": 13,
105
+ "C130": 14,
106
+ "C17": 15,
107
+ "C2": 16,
108
+ "C390": 17,
109
+ "C5": 18,
110
+ "CH47": 19,
111
+ "CL415": 20,
112
+ "E2": 21,
113
+ "E7": 22,
114
+ "EF2000": 23,
115
+ "F117": 24,
116
+ "F14": 25,
117
+ "F15": 26,
118
+ "F16": 27,
119
+ "F18": 28,
120
+ "F22": 29,
121
+ "F35": 30,
122
+ "F4": 31,
123
+ "H6": 32,
124
+ "J10": 33,
125
+ "J20": 34,
126
+ "JAS39": 35,
127
+ "JF17": 36,
128
+ "JH7": 37,
129
+ "KC135": 38,
130
+ "KF21": 39,
131
+ "KJ600": 40,
132
+ "Ka27": 41,
133
+ "Ka52": 42,
134
+ "MQ9": 43,
135
+ "Mi24": 44,
136
+ "Mi26": 45,
137
+ "Mi28": 46,
138
+ "Mig29": 47,
139
+ "Mig31": 48,
140
+ "Mirage2000": 49,
141
+ "P3": 50,
142
+ "RQ4": 51,
143
+ "Rafale": 52,
144
+ "SR71": 53,
145
+ "Su24": 54,
146
+ "Su25": 55,
147
+ "Su34": 56,
148
+ "Su57": 57,
149
+ "TB001": 58,
150
+ "TB2": 59,
151
+ "Tornado": 60,
152
+ "Tu160": 61,
153
+ "Tu22M": 62,
154
+ "Tu95": 63,
155
+ "U2": 64,
156
+ "UH60": 65,
157
+ "US2": 66,
158
+ "V22": 67,
159
+ "Vulcan": 68,
160
+ "WZ7": 69,
161
+ "XB70": 70,
162
+ "Y20": 71,
163
+ "YF23": 72,
164
+ "Z19": 73
165
+ },
166
+ "layer_norm_eps": 1e-12,
167
+ "model_type": "vit",
168
+ "num_attention_heads": 12,
169
+ "num_channels": 3,
170
+ "num_hidden_layers": 12,
171
+ "patch_size": 16,
172
+ "problem_type": "single_label_classification",
173
+ "qkv_bias": true,
174
+ "torch_dtype": "float32",
175
+ "transformers_version": "4.44.2"
176
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.90272614622057,
4
+ "eval_loss": 0.36434125900268555,
5
+ "eval_runtime": 107.3324,
6
+ "eval_samples_per_second": 60.15,
7
+ "eval_steps_per_second": 7.519
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f58a48f9eca60950d8a7ab9033574c67bdafc2c9e1a20a7473fbf4f3f45ce89
3
+ size 343445456
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTImageProcessor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
runs/Dec26_15-46-49_35b123c34d0e/events.out.tfevents.1735228020.35b123c34d0e.40.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf82f8f1dc5adb009881319fb579921da9b1a319aad1d7396fa29e8ea614a186
3
+ size 7498
runs/Dec26_15-47-14_35b123c34d0e/events.out.tfevents.1735228040.35b123c34d0e.40.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c93547236499a48a7101eb38287b8e2f1094a2ca5bc497c4b4527debc0bb40d4
3
+ size 85918
runs/Dec26_15-47-14_35b123c34d0e/events.out.tfevents.1735233522.35b123c34d0e.40.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d4ea215219724069427fe755a38a631d9b2c8c9e305a574830300fbd2d0020c
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 4.004423768814723e+18,
4
+ "train_loss": 0.9726454161726114,
5
+ "train_runtime": 5145.5507,
6
+ "train_samples_per_second": 10.036,
7
+ "train_steps_per_second": 0.627
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.36434125900268555,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-3200",
4
+ "epoch": 2.0,
5
+ "eval_steps": 100,
6
+ "global_step": 3228,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.006195786864931847,
13
+ "grad_norm": 1.9642935991287231,
14
+ "learning_rate": 0.00019938042131350682,
15
+ "loss": 3.9212,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.012391573729863693,
20
+ "grad_norm": 1.8843648433685303,
21
+ "learning_rate": 0.00019876084262701366,
22
+ "loss": 4.1331,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.01858736059479554,
27
+ "grad_norm": 1.7067281007766724,
28
+ "learning_rate": 0.00019814126394052047,
29
+ "loss": 4.0633,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.024783147459727387,
34
+ "grad_norm": 1.7453334331512451,
35
+ "learning_rate": 0.00019752168525402728,
36
+ "loss": 3.9709,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.030978934324659233,
41
+ "grad_norm": 2.055915355682373,
42
+ "learning_rate": 0.0001969021065675341,
43
+ "loss": 3.8473,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.03717472118959108,
48
+ "grad_norm": 1.8063888549804688,
49
+ "learning_rate": 0.00019628252788104092,
50
+ "loss": 3.7734,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.04337050805452292,
55
+ "grad_norm": 1.8397654294967651,
56
+ "learning_rate": 0.0001956629491945477,
57
+ "loss": 3.8279,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.04956629491945477,
62
+ "grad_norm": 2.0568203926086426,
63
+ "learning_rate": 0.00019504337050805452,
64
+ "loss": 3.7613,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.055762081784386616,
69
+ "grad_norm": 2.3892436027526855,
70
+ "learning_rate": 0.00019442379182156135,
71
+ "loss": 3.6098,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.061957868649318466,
76
+ "grad_norm": 2.0522940158843994,
77
+ "learning_rate": 0.00019380421313506816,
78
+ "loss": 3.5924,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.061957868649318466,
83
+ "eval_accuracy": 0.1926889714993804,
84
+ "eval_loss": 3.567479133605957,
85
+ "eval_runtime": 134.6311,
86
+ "eval_samples_per_second": 47.953,
87
+ "eval_steps_per_second": 5.994,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.06815365551425032,
92
+ "grad_norm": 2.773609161376953,
93
+ "learning_rate": 0.00019318463444857497,
94
+ "loss": 3.5271,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 0.07434944237918216,
99
+ "grad_norm": 2.709200620651245,
100
+ "learning_rate": 0.00019256505576208178,
101
+ "loss": 3.5848,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 0.080545229244114,
106
+ "grad_norm": 2.2230889797210693,
107
+ "learning_rate": 0.00019194547707558862,
108
+ "loss": 3.419,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 0.08674101610904585,
113
+ "grad_norm": 2.003593683242798,
114
+ "learning_rate": 0.00019132589838909543,
115
+ "loss": 3.4474,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 0.09293680297397769,
120
+ "grad_norm": 2.4022834300994873,
121
+ "learning_rate": 0.00019070631970260224,
122
+ "loss": 3.3324,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 0.09913258983890955,
127
+ "grad_norm": 2.2967748641967773,
128
+ "learning_rate": 0.00019008674101610905,
129
+ "loss": 3.1463,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 0.10532837670384139,
134
+ "grad_norm": 2.498286724090576,
135
+ "learning_rate": 0.0001894671623296159,
136
+ "loss": 3.2931,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 0.11152416356877323,
141
+ "grad_norm": 2.418286085128784,
142
+ "learning_rate": 0.0001888475836431227,
143
+ "loss": 3.24,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.11771995043370508,
148
+ "grad_norm": 2.7124204635620117,
149
+ "learning_rate": 0.0001882280049566295,
150
+ "loss": 3.1876,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 0.12391573729863693,
155
+ "grad_norm": 2.0705409049987793,
156
+ "learning_rate": 0.00018760842627013632,
157
+ "loss": 3.0189,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.12391573729863693,
162
+ "eval_accuracy": 0.30467781908302355,
163
+ "eval_loss": 3.0312585830688477,
164
+ "eval_runtime": 106.8938,
165
+ "eval_samples_per_second": 60.396,
166
+ "eval_steps_per_second": 7.55,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.13011152416356878,
171
+ "grad_norm": 2.31803822517395,
172
+ "learning_rate": 0.00018698884758364313,
173
+ "loss": 2.8399,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 0.13630731102850063,
178
+ "grad_norm": 2.759551525115967,
179
+ "learning_rate": 0.00018636926889714994,
180
+ "loss": 2.9606,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 0.14250309789343246,
185
+ "grad_norm": 2.6668756008148193,
186
+ "learning_rate": 0.00018574969021065675,
187
+ "loss": 2.8822,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 0.14869888475836432,
192
+ "grad_norm": 2.4296159744262695,
193
+ "learning_rate": 0.00018513011152416359,
194
+ "loss": 3.0186,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 0.15489467162329615,
199
+ "grad_norm": 2.854280948638916,
200
+ "learning_rate": 0.0001845105328376704,
201
+ "loss": 2.85,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 0.161090458488228,
206
+ "grad_norm": 2.2501845359802246,
207
+ "learning_rate": 0.0001838909541511772,
208
+ "loss": 2.7842,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 0.16728624535315986,
213
+ "grad_norm": 2.2110307216644287,
214
+ "learning_rate": 0.00018327137546468402,
215
+ "loss": 2.6941,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 0.1734820322180917,
220
+ "grad_norm": 3.233548164367676,
221
+ "learning_rate": 0.00018265179677819085,
222
+ "loss": 2.7383,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 0.17967781908302355,
227
+ "grad_norm": 3.4657251834869385,
228
+ "learning_rate": 0.00018203221809169766,
229
+ "loss": 2.8092,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 0.18587360594795538,
234
+ "grad_norm": 2.545624256134033,
235
+ "learning_rate": 0.00018141263940520447,
236
+ "loss": 2.5541,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 0.18587360594795538,
241
+ "eval_accuracy": 0.3956009913258984,
242
+ "eval_loss": 2.557483434677124,
243
+ "eval_runtime": 106.1092,
244
+ "eval_samples_per_second": 60.843,
245
+ "eval_steps_per_second": 7.605,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.19206939281288724,
250
+ "grad_norm": 3.7214767932891846,
251
+ "learning_rate": 0.00018079306071871128,
252
+ "loss": 2.673,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 0.1982651796778191,
257
+ "grad_norm": 2.865368604660034,
258
+ "learning_rate": 0.00018017348203221812,
259
+ "loss": 2.4952,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 0.20446096654275092,
264
+ "grad_norm": 2.8324084281921387,
265
+ "learning_rate": 0.0001795539033457249,
266
+ "loss": 2.5799,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 0.21065675340768278,
271
+ "grad_norm": 2.980775833129883,
272
+ "learning_rate": 0.00017893432465923171,
273
+ "loss": 2.518,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 0.21685254027261464,
278
+ "grad_norm": 3.9543583393096924,
279
+ "learning_rate": 0.00017831474597273855,
280
+ "loss": 2.3632,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 0.22304832713754646,
285
+ "grad_norm": 6.1679368019104,
286
+ "learning_rate": 0.00017769516728624536,
287
+ "loss": 2.4168,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 0.22924411400247832,
292
+ "grad_norm": 3.2867679595947266,
293
+ "learning_rate": 0.00017707558859975217,
294
+ "loss": 2.4561,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 0.23543990086741015,
299
+ "grad_norm": 3.4070069789886475,
300
+ "learning_rate": 0.00017645600991325898,
301
+ "loss": 2.3094,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 0.241635687732342,
306
+ "grad_norm": 2.8205995559692383,
307
+ "learning_rate": 0.00017583643122676582,
308
+ "loss": 2.2237,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 0.24783147459727387,
313
+ "grad_norm": 3.1000003814697266,
314
+ "learning_rate": 0.00017521685254027263,
315
+ "loss": 2.114,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 0.24783147459727387,
320
+ "eval_accuracy": 0.45709417596034696,
321
+ "eval_loss": 2.233164072036743,
322
+ "eval_runtime": 106.5812,
323
+ "eval_samples_per_second": 60.574,
324
+ "eval_steps_per_second": 7.572,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 0.2540272614622057,
329
+ "grad_norm": 3.512111186981201,
330
+ "learning_rate": 0.00017459727385377944,
331
+ "loss": 2.1117,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 0.26022304832713755,
336
+ "grad_norm": 4.639825344085693,
337
+ "learning_rate": 0.00017397769516728625,
338
+ "loss": 2.0458,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 0.2664188351920694,
343
+ "grad_norm": 3.5039663314819336,
344
+ "learning_rate": 0.00017335811648079309,
345
+ "loss": 2.1726,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 0.27261462205700127,
350
+ "grad_norm": 3.6471774578094482,
351
+ "learning_rate": 0.0001727385377942999,
352
+ "loss": 2.1414,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 0.2788104089219331,
357
+ "grad_norm": 4.325891971588135,
358
+ "learning_rate": 0.0001721189591078067,
359
+ "loss": 2.0651,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 0.2850061957868649,
364
+ "grad_norm": 2.3341152667999268,
365
+ "learning_rate": 0.00017149938042131352,
366
+ "loss": 2.0514,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 0.29120198265179675,
371
+ "grad_norm": 3.562957525253296,
372
+ "learning_rate": 0.00017087980173482033,
373
+ "loss": 1.8995,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 0.29739776951672864,
378
+ "grad_norm": 4.375968933105469,
379
+ "learning_rate": 0.00017026022304832714,
380
+ "loss": 1.981,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 0.30359355638166047,
385
+ "grad_norm": 2.5124247074127197,
386
+ "learning_rate": 0.00016964064436183395,
387
+ "loss": 1.8624,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 0.3097893432465923,
392
+ "grad_norm": 4.64390230178833,
393
+ "learning_rate": 0.00016902106567534078,
394
+ "loss": 1.9624,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 0.3097893432465923,
399
+ "eval_accuracy": 0.559634448574969,
400
+ "eval_loss": 1.945489764213562,
401
+ "eval_runtime": 107.2688,
402
+ "eval_samples_per_second": 60.185,
403
+ "eval_steps_per_second": 7.523,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 0.3159851301115242,
408
+ "grad_norm": 3.2166125774383545,
409
+ "learning_rate": 0.0001684014869888476,
410
+ "loss": 1.8621,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 0.322180916976456,
415
+ "grad_norm": 3.2079391479492188,
416
+ "learning_rate": 0.0001677819083023544,
417
+ "loss": 2.048,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 0.32837670384138784,
422
+ "grad_norm": 4.8925652503967285,
423
+ "learning_rate": 0.0001671623296158612,
424
+ "loss": 1.9486,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 0.3345724907063197,
429
+ "grad_norm": 3.3474984169006348,
430
+ "learning_rate": 0.00016654275092936805,
431
+ "loss": 1.9194,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 0.34076827757125155,
436
+ "grad_norm": 3.6939406394958496,
437
+ "learning_rate": 0.00016592317224287486,
438
+ "loss": 1.8698,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 0.3469640644361834,
443
+ "grad_norm": 3.176316022872925,
444
+ "learning_rate": 0.00016530359355638167,
445
+ "loss": 1.7831,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 0.35315985130111527,
450
+ "grad_norm": 3.4785337448120117,
451
+ "learning_rate": 0.00016468401486988848,
452
+ "loss": 1.7184,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 0.3593556381660471,
457
+ "grad_norm": 2.730374336242676,
458
+ "learning_rate": 0.00016406443618339532,
459
+ "loss": 1.6898,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 0.3655514250309789,
464
+ "grad_norm": 3.3291196823120117,
465
+ "learning_rate": 0.0001634448574969021,
466
+ "loss": 1.6437,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 0.37174721189591076,
471
+ "grad_norm": 4.465322017669678,
472
+ "learning_rate": 0.0001628252788104089,
473
+ "loss": 1.6749,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 0.37174721189591076,
478
+ "eval_accuracy": 0.5786864931846345,
479
+ "eval_loss": 1.7369675636291504,
480
+ "eval_runtime": 106.0837,
481
+ "eval_samples_per_second": 60.858,
482
+ "eval_steps_per_second": 7.607,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 0.37794299876084264,
487
+ "grad_norm": 4.176516056060791,
488
+ "learning_rate": 0.00016220570012391575,
489
+ "loss": 1.6242,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 0.38413878562577447,
494
+ "grad_norm": 4.0990495681762695,
495
+ "learning_rate": 0.00016158612143742256,
496
+ "loss": 1.774,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 0.3903345724907063,
501
+ "grad_norm": 5.12111234664917,
502
+ "learning_rate": 0.00016096654275092937,
503
+ "loss": 1.7558,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 0.3965303593556382,
508
+ "grad_norm": 4.448328018188477,
509
+ "learning_rate": 0.00016034696406443618,
510
+ "loss": 1.6905,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 0.40272614622057,
515
+ "grad_norm": 4.468796253204346,
516
+ "learning_rate": 0.00015972738537794301,
517
+ "loss": 1.5202,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 0.40892193308550184,
522
+ "grad_norm": 4.141849517822266,
523
+ "learning_rate": 0.00015910780669144982,
524
+ "loss": 1.5273,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 0.41511771995043373,
529
+ "grad_norm": 4.007165431976318,
530
+ "learning_rate": 0.00015848822800495664,
531
+ "loss": 1.5843,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 0.42131350681536556,
536
+ "grad_norm": 3.8219428062438965,
537
+ "learning_rate": 0.00015786864931846345,
538
+ "loss": 1.6011,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 0.4275092936802974,
543
+ "grad_norm": 4.7010345458984375,
544
+ "learning_rate": 0.00015724907063197028,
545
+ "loss": 1.6389,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 0.43370508054522927,
550
+ "grad_norm": 3.79860258102417,
551
+ "learning_rate": 0.0001566294919454771,
552
+ "loss": 1.5852,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 0.43370508054522927,
557
+ "eval_accuracy": 0.6438971499380421,
558
+ "eval_loss": 1.4947177171707153,
559
+ "eval_runtime": 106.0976,
560
+ "eval_samples_per_second": 60.85,
561
+ "eval_steps_per_second": 7.606,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 0.4399008674101611,
566
+ "grad_norm": 3.5789403915405273,
567
+ "learning_rate": 0.0001560099132589839,
568
+ "loss": 1.526,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 0.44609665427509293,
573
+ "grad_norm": 4.297870635986328,
574
+ "learning_rate": 0.0001553903345724907,
575
+ "loss": 1.5452,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 0.45229244114002476,
580
+ "grad_norm": 5.054442882537842,
581
+ "learning_rate": 0.00015477075588599752,
582
+ "loss": 1.3529,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 0.45848822800495664,
587
+ "grad_norm": 5.724175930023193,
588
+ "learning_rate": 0.00015415117719950433,
589
+ "loss": 1.4825,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 0.4646840148698885,
594
+ "grad_norm": 3.8260886669158936,
595
+ "learning_rate": 0.00015353159851301114,
596
+ "loss": 1.4692,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 0.4708798017348203,
601
+ "grad_norm": 3.232948064804077,
602
+ "learning_rate": 0.00015291201982651798,
603
+ "loss": 1.43,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 0.4770755885997522,
608
+ "grad_norm": 6.948119163513184,
609
+ "learning_rate": 0.0001522924411400248,
610
+ "loss": 1.5213,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 0.483271375464684,
615
+ "grad_norm": 5.678015232086182,
616
+ "learning_rate": 0.0001516728624535316,
617
+ "loss": 1.2681,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 0.48946716232961585,
622
+ "grad_norm": 3.9260432720184326,
623
+ "learning_rate": 0.0001510532837670384,
624
+ "loss": 1.4108,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 0.49566294919454773,
629
+ "grad_norm": 5.349125862121582,
630
+ "learning_rate": 0.00015043370508054525,
631
+ "loss": 1.1875,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 0.49566294919454773,
636
+ "eval_accuracy": 0.6468401486988847,
637
+ "eval_loss": 1.4151387214660645,
638
+ "eval_runtime": 105.6692,
639
+ "eval_samples_per_second": 61.096,
640
+ "eval_steps_per_second": 7.637,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 0.5018587360594795,
645
+ "grad_norm": 4.983338832855225,
646
+ "learning_rate": 0.00014981412639405206,
647
+ "loss": 1.3196,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 0.5080545229244114,
652
+ "grad_norm": 5.539896488189697,
653
+ "learning_rate": 0.00014919454770755887,
654
+ "loss": 1.4704,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 0.5142503097893433,
659
+ "grad_norm": 4.018556594848633,
660
+ "learning_rate": 0.0001485749690210657,
661
+ "loss": 1.2208,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 0.5204460966542751,
666
+ "grad_norm": 4.443668842315674,
667
+ "learning_rate": 0.00014795539033457251,
668
+ "loss": 1.2335,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 0.5266418835192069,
673
+ "grad_norm": 6.135488033294678,
674
+ "learning_rate": 0.0001473358116480793,
675
+ "loss": 1.4955,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 0.5328376703841388,
680
+ "grad_norm": 5.287814617156982,
681
+ "learning_rate": 0.0001467162329615861,
682
+ "loss": 1.2792,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 0.5390334572490706,
687
+ "grad_norm": 5.035948276519775,
688
+ "learning_rate": 0.00014609665427509294,
689
+ "loss": 1.2946,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 0.5452292441140025,
694
+ "grad_norm": 4.492195129394531,
695
+ "learning_rate": 0.00014547707558859975,
696
+ "loss": 1.2686,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 0.5514250309789344,
701
+ "grad_norm": 4.308188438415527,
702
+ "learning_rate": 0.00014485749690210656,
703
+ "loss": 1.3539,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 0.5576208178438662,
708
+ "grad_norm": 5.699028015136719,
709
+ "learning_rate": 0.00014423791821561337,
710
+ "loss": 1.5114,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 0.5576208178438662,
715
+ "eval_accuracy": 0.682001239157373,
716
+ "eval_loss": 1.2709109783172607,
717
+ "eval_runtime": 105.9622,
718
+ "eval_samples_per_second": 60.927,
719
+ "eval_steps_per_second": 7.616,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 0.563816604708798,
724
+ "grad_norm": 3.6500446796417236,
725
+ "learning_rate": 0.0001436183395291202,
726
+ "loss": 1.1699,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 0.5700123915737298,
731
+ "grad_norm": 5.26973819732666,
732
+ "learning_rate": 0.00014299876084262702,
733
+ "loss": 1.2285,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 0.5762081784386617,
738
+ "grad_norm": 5.995537757873535,
739
+ "learning_rate": 0.00014237918215613383,
740
+ "loss": 1.2008,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 0.5824039653035935,
745
+ "grad_norm": 4.208325386047363,
746
+ "learning_rate": 0.00014175960346964067,
747
+ "loss": 0.9493,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 0.5885997521685254,
752
+ "grad_norm": 4.716500282287598,
753
+ "learning_rate": 0.00014114002478314748,
754
+ "loss": 1.1492,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 0.5947955390334573,
759
+ "grad_norm": 3.3192636966705322,
760
+ "learning_rate": 0.0001405204460966543,
761
+ "loss": 1.3701,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 0.6009913258983891,
766
+ "grad_norm": 4.523627758026123,
767
+ "learning_rate": 0.0001399008674101611,
768
+ "loss": 1.1221,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 0.6071871127633209,
773
+ "grad_norm": 4.424323081970215,
774
+ "learning_rate": 0.0001392812887236679,
775
+ "loss": 1.2023,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 0.6133828996282528,
780
+ "grad_norm": 5.295013427734375,
781
+ "learning_rate": 0.00013866171003717472,
782
+ "loss": 1.0532,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 0.6195786864931846,
787
+ "grad_norm": 4.994614124298096,
788
+ "learning_rate": 0.00013804213135068153,
789
+ "loss": 1.3122,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 0.6195786864931846,
794
+ "eval_accuracy": 0.6939281288723668,
795
+ "eval_loss": 1.1940184831619263,
796
+ "eval_runtime": 106.4084,
797
+ "eval_samples_per_second": 60.672,
798
+ "eval_steps_per_second": 7.584,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 0.6257744733581165,
803
+ "grad_norm": 2.611729145050049,
804
+ "learning_rate": 0.00013742255266418837,
805
+ "loss": 0.9277,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 0.6319702602230484,
810
+ "grad_norm": 2.6009323596954346,
811
+ "learning_rate": 0.00013680297397769518,
812
+ "loss": 1.0508,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 0.6381660470879802,
817
+ "grad_norm": 3.2199230194091797,
818
+ "learning_rate": 0.00013618339529120199,
819
+ "loss": 1.11,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 0.644361833952912,
824
+ "grad_norm": 2.762774705886841,
825
+ "learning_rate": 0.0001355638166047088,
826
+ "loss": 1.0821,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 0.6505576208178439,
831
+ "grad_norm": 3.8875350952148438,
832
+ "learning_rate": 0.00013494423791821563,
833
+ "loss": 1.0406,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 0.6567534076827757,
838
+ "grad_norm": 4.314332008361816,
839
+ "learning_rate": 0.00013432465923172244,
840
+ "loss": 1.1308,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 0.6629491945477075,
845
+ "grad_norm": 3.6009325981140137,
846
+ "learning_rate": 0.00013370508054522925,
847
+ "loss": 1.1907,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 0.6691449814126395,
852
+ "grad_norm": 5.977869033813477,
853
+ "learning_rate": 0.00013308550185873606,
854
+ "loss": 1.1071,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 0.6753407682775713,
859
+ "grad_norm": 5.390667915344238,
860
+ "learning_rate": 0.0001324659231722429,
861
+ "loss": 1.0829,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 0.6815365551425031,
866
+ "grad_norm": 6.445249080657959,
867
+ "learning_rate": 0.0001318463444857497,
868
+ "loss": 1.0721,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 0.6815365551425031,
873
+ "eval_accuracy": 0.7261462205700124,
874
+ "eval_loss": 1.0756527185440063,
875
+ "eval_runtime": 104.6325,
876
+ "eval_samples_per_second": 61.702,
877
+ "eval_steps_per_second": 7.713,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 0.6877323420074349,
882
+ "grad_norm": 3.1488468647003174,
883
+ "learning_rate": 0.0001312267657992565,
884
+ "loss": 0.9928,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 0.6939281288723668,
889
+ "grad_norm": 3.6752538681030273,
890
+ "learning_rate": 0.00013060718711276333,
891
+ "loss": 0.8412,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 0.7001239157372986,
896
+ "grad_norm": 6.186413764953613,
897
+ "learning_rate": 0.00012998760842627014,
898
+ "loss": 0.8979,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 0.7063197026022305,
903
+ "grad_norm": 4.457529544830322,
904
+ "learning_rate": 0.00012936802973977695,
905
+ "loss": 1.0586,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 0.7125154894671624,
910
+ "grad_norm": 3.7016208171844482,
911
+ "learning_rate": 0.00012874845105328376,
912
+ "loss": 0.7632,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 0.7187112763320942,
917
+ "grad_norm": 3.913440227508545,
918
+ "learning_rate": 0.0001281288723667906,
919
+ "loss": 0.9905,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 0.724907063197026,
924
+ "grad_norm": 4.720458984375,
925
+ "learning_rate": 0.0001275092936802974,
926
+ "loss": 0.8703,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 0.7311028500619579,
931
+ "grad_norm": 4.232792854309082,
932
+ "learning_rate": 0.00012688971499380422,
933
+ "loss": 1.2989,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 0.7372986369268897,
938
+ "grad_norm": 5.886707305908203,
939
+ "learning_rate": 0.00012627013630731103,
940
+ "loss": 1.1108,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 0.7434944237918215,
945
+ "grad_norm": 3.9497408866882324,
946
+ "learning_rate": 0.00012565055762081787,
947
+ "loss": 0.8249,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 0.7434944237918215,
952
+ "eval_accuracy": 0.7575898389095415,
953
+ "eval_loss": 0.9666171669960022,
954
+ "eval_runtime": 106.5314,
955
+ "eval_samples_per_second": 60.602,
956
+ "eval_steps_per_second": 7.575,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 0.7496902106567535,
961
+ "grad_norm": 5.659217357635498,
962
+ "learning_rate": 0.00012503097893432468,
963
+ "loss": 0.8484,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 0.7558859975216853,
968
+ "grad_norm": 4.372608661651611,
969
+ "learning_rate": 0.00012441140024783149,
970
+ "loss": 0.9891,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 0.7620817843866171,
975
+ "grad_norm": 7.296494007110596,
976
+ "learning_rate": 0.0001237918215613383,
977
+ "loss": 0.919,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 0.7682775712515489,
982
+ "grad_norm": 2.1547601222991943,
983
+ "learning_rate": 0.0001231722428748451,
984
+ "loss": 0.8611,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 0.7744733581164808,
989
+ "grad_norm": 4.57709264755249,
990
+ "learning_rate": 0.00012255266418835192,
991
+ "loss": 0.893,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 0.7806691449814126,
996
+ "grad_norm": 4.922760009765625,
997
+ "learning_rate": 0.00012193308550185874,
998
+ "loss": 0.9478,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 0.7868649318463445,
1003
+ "grad_norm": 4.814465045928955,
1004
+ "learning_rate": 0.00012131350681536555,
1005
+ "loss": 0.9416,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 0.7930607187112764,
1010
+ "grad_norm": 5.223793029785156,
1011
+ "learning_rate": 0.00012069392812887237,
1012
+ "loss": 0.7468,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 0.7992565055762082,
1017
+ "grad_norm": 7.333277702331543,
1018
+ "learning_rate": 0.00012007434944237918,
1019
+ "loss": 0.8581,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 0.80545229244114,
1024
+ "grad_norm": 2.8909173011779785,
1025
+ "learning_rate": 0.000119454770755886,
1026
+ "loss": 0.7944,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 0.80545229244114,
1031
+ "eval_accuracy": 0.7707558859975217,
1032
+ "eval_loss": 0.9101163744926453,
1033
+ "eval_runtime": 106.3936,
1034
+ "eval_samples_per_second": 60.68,
1035
+ "eval_steps_per_second": 7.585,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 0.8116480793060719,
1040
+ "grad_norm": 5.546220302581787,
1041
+ "learning_rate": 0.00011883519206939282,
1042
+ "loss": 1.066,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 0.8178438661710037,
1047
+ "grad_norm": 5.582309246063232,
1048
+ "learning_rate": 0.00011821561338289964,
1049
+ "loss": 0.7992,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 0.8240396530359355,
1054
+ "grad_norm": 6.598534107208252,
1055
+ "learning_rate": 0.00011759603469640645,
1056
+ "loss": 0.8142,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 0.8302354399008675,
1061
+ "grad_norm": 3.366227865219116,
1062
+ "learning_rate": 0.00011697645600991327,
1063
+ "loss": 0.8297,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 0.8364312267657993,
1068
+ "grad_norm": 4.621030807495117,
1069
+ "learning_rate": 0.00011635687732342008,
1070
+ "loss": 0.8083,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 0.8426270136307311,
1075
+ "grad_norm": 4.285297870635986,
1076
+ "learning_rate": 0.00011573729863692691,
1077
+ "loss": 0.8572,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 0.8488228004956629,
1082
+ "grad_norm": 5.127432823181152,
1083
+ "learning_rate": 0.0001151177199504337,
1084
+ "loss": 0.7723,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 0.8550185873605948,
1089
+ "grad_norm": 8.046000480651855,
1090
+ "learning_rate": 0.00011449814126394051,
1091
+ "loss": 0.6129,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 0.8612143742255266,
1096
+ "grad_norm": 3.8149867057800293,
1097
+ "learning_rate": 0.00011387856257744734,
1098
+ "loss": 0.7718,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 0.8674101610904585,
1103
+ "grad_norm": 7.305781364440918,
1104
+ "learning_rate": 0.00011325898389095415,
1105
+ "loss": 0.8032,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 0.8674101610904585,
1110
+ "eval_accuracy": 0.7690520446096655,
1111
+ "eval_loss": 0.901136040687561,
1112
+ "eval_runtime": 106.4417,
1113
+ "eval_samples_per_second": 60.653,
1114
+ "eval_steps_per_second": 7.582,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 0.8736059479553904,
1119
+ "grad_norm": 5.850032329559326,
1120
+ "learning_rate": 0.00011263940520446097,
1121
+ "loss": 0.9454,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 0.8798017348203222,
1126
+ "grad_norm": 5.336400032043457,
1127
+ "learning_rate": 0.00011201982651796778,
1128
+ "loss": 0.7002,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 0.885997521685254,
1133
+ "grad_norm": 3.1872918605804443,
1134
+ "learning_rate": 0.0001114002478314746,
1135
+ "loss": 0.8848,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 0.8921933085501859,
1140
+ "grad_norm": 5.940222263336182,
1141
+ "learning_rate": 0.00011078066914498142,
1142
+ "loss": 0.8746,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 0.8983890954151177,
1147
+ "grad_norm": 4.074731349945068,
1148
+ "learning_rate": 0.00011016109045848824,
1149
+ "loss": 0.9426,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 0.9045848822800495,
1154
+ "grad_norm": 4.467647552490234,
1155
+ "learning_rate": 0.00010954151177199505,
1156
+ "loss": 0.7319,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 0.9107806691449815,
1161
+ "grad_norm": 4.298549175262451,
1162
+ "learning_rate": 0.00010892193308550187,
1163
+ "loss": 0.7648,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 0.9169764560099133,
1168
+ "grad_norm": 5.923393249511719,
1169
+ "learning_rate": 0.00010830235439900868,
1170
+ "loss": 0.66,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 0.9231722428748451,
1175
+ "grad_norm": 3.252465009689331,
1176
+ "learning_rate": 0.0001076827757125155,
1177
+ "loss": 0.6732,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 0.929368029739777,
1182
+ "grad_norm": 5.450772285461426,
1183
+ "learning_rate": 0.0001070631970260223,
1184
+ "loss": 0.7479,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 0.929368029739777,
1189
+ "eval_accuracy": 0.8066914498141264,
1190
+ "eval_loss": 0.7409122586250305,
1191
+ "eval_runtime": 106.9474,
1192
+ "eval_samples_per_second": 60.366,
1193
+ "eval_steps_per_second": 7.546,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 0.9355638166047088,
1198
+ "grad_norm": 3.8321099281311035,
1199
+ "learning_rate": 0.00010644361833952911,
1200
+ "loss": 0.6568,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 0.9417596034696406,
1205
+ "grad_norm": 2.7507283687591553,
1206
+ "learning_rate": 0.00010582403965303594,
1207
+ "loss": 0.7641,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 0.9479553903345725,
1212
+ "grad_norm": 4.787430286407471,
1213
+ "learning_rate": 0.00010520446096654275,
1214
+ "loss": 0.8192,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 0.9541511771995044,
1219
+ "grad_norm": 5.063214302062988,
1220
+ "learning_rate": 0.00010458488228004957,
1221
+ "loss": 0.7953,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 0.9603469640644362,
1226
+ "grad_norm": 3.0914242267608643,
1227
+ "learning_rate": 0.00010396530359355638,
1228
+ "loss": 0.6188,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 0.966542750929368,
1233
+ "grad_norm": 2.4344420433044434,
1234
+ "learning_rate": 0.0001033457249070632,
1235
+ "loss": 0.7707,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 0.9727385377942999,
1240
+ "grad_norm": 5.563531398773193,
1241
+ "learning_rate": 0.00010272614622057001,
1242
+ "loss": 0.657,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 0.9789343246592317,
1247
+ "grad_norm": 2.2125167846679688,
1248
+ "learning_rate": 0.00010210656753407684,
1249
+ "loss": 0.7362,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 0.9851301115241635,
1254
+ "grad_norm": 7.253428936004639,
1255
+ "learning_rate": 0.00010148698884758365,
1256
+ "loss": 0.974,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 0.9913258983890955,
1261
+ "grad_norm": 5.762598037719727,
1262
+ "learning_rate": 0.00010086741016109047,
1263
+ "loss": 0.5997,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 0.9913258983890955,
1268
+ "eval_accuracy": 0.8110285006195787,
1269
+ "eval_loss": 0.7325805425643921,
1270
+ "eval_runtime": 105.9884,
1271
+ "eval_samples_per_second": 60.912,
1272
+ "eval_steps_per_second": 7.614,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 0.9975216852540273,
1277
+ "grad_norm": 7.013967037200928,
1278
+ "learning_rate": 0.00010024783147459728,
1279
+ "loss": 0.7562,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 1.003717472118959,
1284
+ "grad_norm": 4.252784252166748,
1285
+ "learning_rate": 9.962825278810409e-05,
1286
+ "loss": 0.5769,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 1.009913258983891,
1291
+ "grad_norm": 2.1795663833618164,
1292
+ "learning_rate": 9.900867410161091e-05,
1293
+ "loss": 0.4631,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 1.016109045848823,
1298
+ "grad_norm": 2.469095468521118,
1299
+ "learning_rate": 9.838909541511772e-05,
1300
+ "loss": 0.3613,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 1.0223048327137547,
1305
+ "grad_norm": 4.4682297706604,
1306
+ "learning_rate": 9.776951672862455e-05,
1307
+ "loss": 0.4184,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 1.0285006195786865,
1312
+ "grad_norm": 1.6269049644470215,
1313
+ "learning_rate": 9.714993804213134e-05,
1314
+ "loss": 0.3376,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 1.0346964064436184,
1319
+ "grad_norm": 4.2464423179626465,
1320
+ "learning_rate": 9.653035935563817e-05,
1321
+ "loss": 0.4305,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 1.0408921933085502,
1326
+ "grad_norm": 3.627943992614746,
1327
+ "learning_rate": 9.591078066914498e-05,
1328
+ "loss": 0.3696,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 1.047087980173482,
1333
+ "grad_norm": 2.6817383766174316,
1334
+ "learning_rate": 9.52912019826518e-05,
1335
+ "loss": 0.3897,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 1.0532837670384139,
1340
+ "grad_norm": 7.518842697143555,
1341
+ "learning_rate": 9.467162329615861e-05,
1342
+ "loss": 0.5005,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 1.0532837670384139,
1347
+ "eval_accuracy": 0.8210966542750929,
1348
+ "eval_loss": 0.6769081950187683,
1349
+ "eval_runtime": 105.6921,
1350
+ "eval_samples_per_second": 61.083,
1351
+ "eval_steps_per_second": 7.635,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 1.0594795539033457,
1356
+ "grad_norm": 3.0254740715026855,
1357
+ "learning_rate": 9.405204460966544e-05,
1358
+ "loss": 0.3827,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 1.0656753407682775,
1363
+ "grad_norm": 4.391673564910889,
1364
+ "learning_rate": 9.343246592317225e-05,
1365
+ "loss": 0.3535,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 1.0718711276332094,
1370
+ "grad_norm": 3.73157000541687,
1371
+ "learning_rate": 9.281288723667906e-05,
1372
+ "loss": 0.3238,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 1.0780669144981412,
1377
+ "grad_norm": 2.160573720932007,
1378
+ "learning_rate": 9.219330855018588e-05,
1379
+ "loss": 0.383,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 1.084262701363073,
1384
+ "grad_norm": 4.27864408493042,
1385
+ "learning_rate": 9.157372986369269e-05,
1386
+ "loss": 0.3361,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 1.090458488228005,
1391
+ "grad_norm": 3.1258535385131836,
1392
+ "learning_rate": 9.095415117719951e-05,
1393
+ "loss": 0.2797,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 1.096654275092937,
1398
+ "grad_norm": 3.8895909786224365,
1399
+ "learning_rate": 9.033457249070632e-05,
1400
+ "loss": 0.4367,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 1.1028500619578687,
1405
+ "grad_norm": 2.674630880355835,
1406
+ "learning_rate": 8.971499380421315e-05,
1407
+ "loss": 0.3925,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 1.1090458488228006,
1412
+ "grad_norm": 7.267265319824219,
1413
+ "learning_rate": 8.909541511771994e-05,
1414
+ "loss": 0.4627,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 1.1152416356877324,
1419
+ "grad_norm": 4.650302886962891,
1420
+ "learning_rate": 8.847583643122677e-05,
1421
+ "loss": 0.4107,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 1.1152416356877324,
1426
+ "eval_accuracy": 0.837360594795539,
1427
+ "eval_loss": 0.6374781131744385,
1428
+ "eval_runtime": 106.1067,
1429
+ "eval_samples_per_second": 60.844,
1430
+ "eval_steps_per_second": 7.606,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 1.1214374225526642,
1435
+ "grad_norm": 2.1244664192199707,
1436
+ "learning_rate": 8.785625774473358e-05,
1437
+ "loss": 0.3463,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 1.127633209417596,
1442
+ "grad_norm": 1.8372740745544434,
1443
+ "learning_rate": 8.72366790582404e-05,
1444
+ "loss": 0.3116,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 1.1338289962825279,
1449
+ "grad_norm": 3.6969428062438965,
1450
+ "learning_rate": 8.661710037174722e-05,
1451
+ "loss": 0.4069,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 1.1400247831474597,
1456
+ "grad_norm": 3.857111930847168,
1457
+ "learning_rate": 8.599752168525403e-05,
1458
+ "loss": 0.4478,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 1.1462205700123915,
1463
+ "grad_norm": 2.005557060241699,
1464
+ "learning_rate": 8.537794299876086e-05,
1465
+ "loss": 0.2794,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 1.1524163568773234,
1470
+ "grad_norm": 5.883118629455566,
1471
+ "learning_rate": 8.475836431226765e-05,
1472
+ "loss": 0.3732,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 1.1586121437422552,
1477
+ "grad_norm": 5.240428924560547,
1478
+ "learning_rate": 8.413878562577448e-05,
1479
+ "loss": 0.3042,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 1.164807930607187,
1484
+ "grad_norm": 2.857640027999878,
1485
+ "learning_rate": 8.351920693928129e-05,
1486
+ "loss": 0.3013,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 1.1710037174721188,
1491
+ "grad_norm": 5.086670398712158,
1492
+ "learning_rate": 8.289962825278811e-05,
1493
+ "loss": 0.2786,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 1.177199504337051,
1498
+ "grad_norm": 4.9353413581848145,
1499
+ "learning_rate": 8.228004956629492e-05,
1500
+ "loss": 0.4596,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 1.177199504337051,
1505
+ "eval_accuracy": 0.8303903345724907,
1506
+ "eval_loss": 0.6301799416542053,
1507
+ "eval_runtime": 106.1215,
1508
+ "eval_samples_per_second": 60.836,
1509
+ "eval_steps_per_second": 7.604,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 1.1833952912019827,
1514
+ "grad_norm": 3.3738842010498047,
1515
+ "learning_rate": 8.166047087980174e-05,
1516
+ "loss": 0.2521,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 1.1895910780669146,
1521
+ "grad_norm": 3.060638427734375,
1522
+ "learning_rate": 8.104089219330855e-05,
1523
+ "loss": 0.3277,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 1.1957868649318464,
1528
+ "grad_norm": 3.8044564723968506,
1529
+ "learning_rate": 8.042131350681536e-05,
1530
+ "loss": 0.3403,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 1.2019826517967782,
1535
+ "grad_norm": 4.681379795074463,
1536
+ "learning_rate": 7.980173482032219e-05,
1537
+ "loss": 0.3816,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 1.20817843866171,
1542
+ "grad_norm": 5.634707450866699,
1543
+ "learning_rate": 7.9182156133829e-05,
1544
+ "loss": 0.3161,
1545
+ "step": 1950
1546
+ },
1547
+ {
1548
+ "epoch": 1.2143742255266419,
1549
+ "grad_norm": 2.39424204826355,
1550
+ "learning_rate": 7.856257744733582e-05,
1551
+ "loss": 0.3638,
1552
+ "step": 1960
1553
+ },
1554
+ {
1555
+ "epoch": 1.2205700123915737,
1556
+ "grad_norm": 2.7920138835906982,
1557
+ "learning_rate": 7.794299876084263e-05,
1558
+ "loss": 0.2685,
1559
+ "step": 1970
1560
+ },
1561
+ {
1562
+ "epoch": 1.2267657992565055,
1563
+ "grad_norm": 6.918692111968994,
1564
+ "learning_rate": 7.732342007434946e-05,
1565
+ "loss": 0.2768,
1566
+ "step": 1980
1567
+ },
1568
+ {
1569
+ "epoch": 1.2329615861214374,
1570
+ "grad_norm": 4.783863544464111,
1571
+ "learning_rate": 7.670384138785625e-05,
1572
+ "loss": 0.3513,
1573
+ "step": 1990
1574
+ },
1575
+ {
1576
+ "epoch": 1.2391573729863692,
1577
+ "grad_norm": 1.5945993661880493,
1578
+ "learning_rate": 7.608426270136308e-05,
1579
+ "loss": 0.2544,
1580
+ "step": 2000
1581
+ },
1582
+ {
1583
+ "epoch": 1.2391573729863692,
1584
+ "eval_accuracy": 0.8399938042131351,
1585
+ "eval_loss": 0.5804997086524963,
1586
+ "eval_runtime": 106.7403,
1587
+ "eval_samples_per_second": 60.483,
1588
+ "eval_steps_per_second": 7.56,
1589
+ "step": 2000
1590
+ },
1591
+ {
1592
+ "epoch": 1.2453531598513012,
1593
+ "grad_norm": 0.8452507853507996,
1594
+ "learning_rate": 7.546468401486989e-05,
1595
+ "loss": 0.2581,
1596
+ "step": 2010
1597
+ },
1598
+ {
1599
+ "epoch": 1.251548946716233,
1600
+ "grad_norm": 3.4717066287994385,
1601
+ "learning_rate": 7.484510532837671e-05,
1602
+ "loss": 0.2793,
1603
+ "step": 2020
1604
+ },
1605
+ {
1606
+ "epoch": 1.257744733581165,
1607
+ "grad_norm": 3.5942156314849854,
1608
+ "learning_rate": 7.422552664188352e-05,
1609
+ "loss": 0.2798,
1610
+ "step": 2030
1611
+ },
1612
+ {
1613
+ "epoch": 1.2639405204460967,
1614
+ "grad_norm": 5.311221599578857,
1615
+ "learning_rate": 7.360594795539034e-05,
1616
+ "loss": 0.3501,
1617
+ "step": 2040
1618
+ },
1619
+ {
1620
+ "epoch": 1.2701363073110286,
1621
+ "grad_norm": 3.8793325424194336,
1622
+ "learning_rate": 7.298636926889715e-05,
1623
+ "loss": 0.3645,
1624
+ "step": 2050
1625
+ },
1626
+ {
1627
+ "epoch": 1.2763320941759604,
1628
+ "grad_norm": 2.781317949295044,
1629
+ "learning_rate": 7.236679058240396e-05,
1630
+ "loss": 0.3294,
1631
+ "step": 2060
1632
+ },
1633
+ {
1634
+ "epoch": 1.2825278810408922,
1635
+ "grad_norm": 1.0684071779251099,
1636
+ "learning_rate": 7.174721189591079e-05,
1637
+ "loss": 0.2507,
1638
+ "step": 2070
1639
+ },
1640
+ {
1641
+ "epoch": 1.288723667905824,
1642
+ "grad_norm": 1.6585029363632202,
1643
+ "learning_rate": 7.11276332094176e-05,
1644
+ "loss": 0.3451,
1645
+ "step": 2080
1646
+ },
1647
+ {
1648
+ "epoch": 1.2949194547707559,
1649
+ "grad_norm": 1.9376587867736816,
1650
+ "learning_rate": 7.050805452292442e-05,
1651
+ "loss": 0.3423,
1652
+ "step": 2090
1653
+ },
1654
+ {
1655
+ "epoch": 1.3011152416356877,
1656
+ "grad_norm": 2.892873525619507,
1657
+ "learning_rate": 6.988847583643123e-05,
1658
+ "loss": 0.2983,
1659
+ "step": 2100
1660
+ },
1661
+ {
1662
+ "epoch": 1.3011152416356877,
1663
+ "eval_accuracy": 0.8500619578686494,
1664
+ "eval_loss": 0.5480403304100037,
1665
+ "eval_runtime": 107.2113,
1666
+ "eval_samples_per_second": 60.218,
1667
+ "eval_steps_per_second": 7.527,
1668
+ "step": 2100
1669
+ },
1670
+ {
1671
+ "epoch": 1.3073110285006195,
1672
+ "grad_norm": 0.8831340670585632,
1673
+ "learning_rate": 6.926889714993805e-05,
1674
+ "loss": 0.3744,
1675
+ "step": 2110
1676
+ },
1677
+ {
1678
+ "epoch": 1.3135068153655514,
1679
+ "grad_norm": 5.404819011688232,
1680
+ "learning_rate": 6.864931846344485e-05,
1681
+ "loss": 0.2856,
1682
+ "step": 2120
1683
+ },
1684
+ {
1685
+ "epoch": 1.3197026022304832,
1686
+ "grad_norm": 5.324650287628174,
1687
+ "learning_rate": 6.802973977695167e-05,
1688
+ "loss": 0.2055,
1689
+ "step": 2130
1690
+ },
1691
+ {
1692
+ "epoch": 1.325898389095415,
1693
+ "grad_norm": 1.5021191835403442,
1694
+ "learning_rate": 6.741016109045848e-05,
1695
+ "loss": 0.3426,
1696
+ "step": 2140
1697
+ },
1698
+ {
1699
+ "epoch": 1.3320941759603468,
1700
+ "grad_norm": 3.320554256439209,
1701
+ "learning_rate": 6.679058240396531e-05,
1702
+ "loss": 0.1865,
1703
+ "step": 2150
1704
+ },
1705
+ {
1706
+ "epoch": 1.3382899628252787,
1707
+ "grad_norm": 6.635782241821289,
1708
+ "learning_rate": 6.617100371747212e-05,
1709
+ "loss": 0.2956,
1710
+ "step": 2160
1711
+ },
1712
+ {
1713
+ "epoch": 1.3444857496902107,
1714
+ "grad_norm": 7.695481300354004,
1715
+ "learning_rate": 6.555142503097894e-05,
1716
+ "loss": 0.2729,
1717
+ "step": 2170
1718
+ },
1719
+ {
1720
+ "epoch": 1.3506815365551426,
1721
+ "grad_norm": 2.3640730381011963,
1722
+ "learning_rate": 6.493184634448575e-05,
1723
+ "loss": 0.2946,
1724
+ "step": 2180
1725
+ },
1726
+ {
1727
+ "epoch": 1.3568773234200744,
1728
+ "grad_norm": 6.867854595184326,
1729
+ "learning_rate": 6.431226765799256e-05,
1730
+ "loss": 0.269,
1731
+ "step": 2190
1732
+ },
1733
+ {
1734
+ "epoch": 1.3630731102850062,
1735
+ "grad_norm": 5.83229923248291,
1736
+ "learning_rate": 6.369268897149939e-05,
1737
+ "loss": 0.3214,
1738
+ "step": 2200
1739
+ },
1740
+ {
1741
+ "epoch": 1.3630731102850062,
1742
+ "eval_accuracy": 0.8683395291201983,
1743
+ "eval_loss": 0.5052544474601746,
1744
+ "eval_runtime": 106.9004,
1745
+ "eval_samples_per_second": 60.393,
1746
+ "eval_steps_per_second": 7.549,
1747
+ "step": 2200
1748
+ },
1749
+ {
1750
+ "epoch": 1.369268897149938,
1751
+ "grad_norm": 4.513510227203369,
1752
+ "learning_rate": 6.30731102850062e-05,
1753
+ "loss": 0.2435,
1754
+ "step": 2210
1755
+ },
1756
+ {
1757
+ "epoch": 1.3754646840148699,
1758
+ "grad_norm": 0.9935147762298584,
1759
+ "learning_rate": 6.245353159851302e-05,
1760
+ "loss": 0.237,
1761
+ "step": 2220
1762
+ },
1763
+ {
1764
+ "epoch": 1.3816604708798017,
1765
+ "grad_norm": 5.048427104949951,
1766
+ "learning_rate": 6.183395291201983e-05,
1767
+ "loss": 0.1811,
1768
+ "step": 2230
1769
+ },
1770
+ {
1771
+ "epoch": 1.3878562577447335,
1772
+ "grad_norm": 4.882187366485596,
1773
+ "learning_rate": 6.121437422552665e-05,
1774
+ "loss": 0.2591,
1775
+ "step": 2240
1776
+ },
1777
+ {
1778
+ "epoch": 1.3940520446096654,
1779
+ "grad_norm": 3.1441776752471924,
1780
+ "learning_rate": 6.0594795539033456e-05,
1781
+ "loss": 0.2503,
1782
+ "step": 2250
1783
+ },
1784
+ {
1785
+ "epoch": 1.4002478314745972,
1786
+ "grad_norm": 5.158385753631592,
1787
+ "learning_rate": 5.997521685254027e-05,
1788
+ "loss": 0.3771,
1789
+ "step": 2260
1790
+ },
1791
+ {
1792
+ "epoch": 1.4064436183395292,
1793
+ "grad_norm": 4.472280979156494,
1794
+ "learning_rate": 5.935563816604709e-05,
1795
+ "loss": 0.2673,
1796
+ "step": 2270
1797
+ },
1798
+ {
1799
+ "epoch": 1.412639405204461,
1800
+ "grad_norm": 3.1100497245788574,
1801
+ "learning_rate": 5.8736059479553906e-05,
1802
+ "loss": 0.3471,
1803
+ "step": 2280
1804
+ },
1805
+ {
1806
+ "epoch": 1.418835192069393,
1807
+ "grad_norm": 1.5550055503845215,
1808
+ "learning_rate": 5.811648079306072e-05,
1809
+ "loss": 0.2452,
1810
+ "step": 2290
1811
+ },
1812
+ {
1813
+ "epoch": 1.4250309789343247,
1814
+ "grad_norm": 1.051927924156189,
1815
+ "learning_rate": 5.749690210656754e-05,
1816
+ "loss": 0.2384,
1817
+ "step": 2300
1818
+ },
1819
+ {
1820
+ "epoch": 1.4250309789343247,
1821
+ "eval_accuracy": 0.8712825278810409,
1822
+ "eval_loss": 0.4928523004055023,
1823
+ "eval_runtime": 106.1123,
1824
+ "eval_samples_per_second": 60.841,
1825
+ "eval_steps_per_second": 7.605,
1826
+ "step": 2300
1827
+ },
1828
+ {
1829
+ "epoch": 1.4312267657992566,
1830
+ "grad_norm": 3.616842031478882,
1831
+ "learning_rate": 5.687732342007436e-05,
1832
+ "loss": 0.251,
1833
+ "step": 2310
1834
+ },
1835
+ {
1836
+ "epoch": 1.4374225526641884,
1837
+ "grad_norm": 3.94722580909729,
1838
+ "learning_rate": 5.625774473358117e-05,
1839
+ "loss": 0.1996,
1840
+ "step": 2320
1841
+ },
1842
+ {
1843
+ "epoch": 1.4436183395291202,
1844
+ "grad_norm": 6.4395222663879395,
1845
+ "learning_rate": 5.5638166047087984e-05,
1846
+ "loss": 0.3224,
1847
+ "step": 2330
1848
+ },
1849
+ {
1850
+ "epoch": 1.449814126394052,
1851
+ "grad_norm": 2.801499366760254,
1852
+ "learning_rate": 5.50185873605948e-05,
1853
+ "loss": 0.2018,
1854
+ "step": 2340
1855
+ },
1856
+ {
1857
+ "epoch": 1.4560099132589839,
1858
+ "grad_norm": 4.119659900665283,
1859
+ "learning_rate": 5.439900867410162e-05,
1860
+ "loss": 0.2661,
1861
+ "step": 2350
1862
+ },
1863
+ {
1864
+ "epoch": 1.4622057001239157,
1865
+ "grad_norm": 8.405607223510742,
1866
+ "learning_rate": 5.3779429987608434e-05,
1867
+ "loss": 0.3412,
1868
+ "step": 2360
1869
+ },
1870
+ {
1871
+ "epoch": 1.4684014869888475,
1872
+ "grad_norm": 3.3943393230438232,
1873
+ "learning_rate": 5.315985130111525e-05,
1874
+ "loss": 0.303,
1875
+ "step": 2370
1876
+ },
1877
+ {
1878
+ "epoch": 1.4745972738537794,
1879
+ "grad_norm": 3.990785837173462,
1880
+ "learning_rate": 5.2540272614622054e-05,
1881
+ "loss": 0.1961,
1882
+ "step": 2380
1883
+ },
1884
+ {
1885
+ "epoch": 1.4807930607187112,
1886
+ "grad_norm": 8.141942977905273,
1887
+ "learning_rate": 5.192069392812887e-05,
1888
+ "loss": 0.2683,
1889
+ "step": 2390
1890
+ },
1891
+ {
1892
+ "epoch": 1.486988847583643,
1893
+ "grad_norm": 2.5247440338134766,
1894
+ "learning_rate": 5.130111524163569e-05,
1895
+ "loss": 0.2397,
1896
+ "step": 2400
1897
+ },
1898
+ {
1899
+ "epoch": 1.486988847583643,
1900
+ "eval_accuracy": 0.8742255266418835,
1901
+ "eval_loss": 0.4664279520511627,
1902
+ "eval_runtime": 106.5781,
1903
+ "eval_samples_per_second": 60.575,
1904
+ "eval_steps_per_second": 7.572,
1905
+ "step": 2400
1906
+ },
1907
+ {
1908
+ "epoch": 1.4931846344485749,
1909
+ "grad_norm": 2.132350206375122,
1910
+ "learning_rate": 5.0681536555142505e-05,
1911
+ "loss": 0.3258,
1912
+ "step": 2410
1913
+ },
1914
+ {
1915
+ "epoch": 1.4993804213135067,
1916
+ "grad_norm": 4.359376430511475,
1917
+ "learning_rate": 5.006195786864932e-05,
1918
+ "loss": 0.2781,
1919
+ "step": 2420
1920
+ },
1921
+ {
1922
+ "epoch": 1.5055762081784385,
1923
+ "grad_norm": 7.171940326690674,
1924
+ "learning_rate": 4.944237918215613e-05,
1925
+ "loss": 0.2828,
1926
+ "step": 2430
1927
+ },
1928
+ {
1929
+ "epoch": 1.5117719950433703,
1930
+ "grad_norm": 1.664962649345398,
1931
+ "learning_rate": 4.882280049566295e-05,
1932
+ "loss": 0.206,
1933
+ "step": 2440
1934
+ },
1935
+ {
1936
+ "epoch": 1.5179677819083024,
1937
+ "grad_norm": 7.0570268630981445,
1938
+ "learning_rate": 4.820322180916977e-05,
1939
+ "loss": 0.2365,
1940
+ "step": 2450
1941
+ },
1942
+ {
1943
+ "epoch": 1.5241635687732342,
1944
+ "grad_norm": 4.6403279304504395,
1945
+ "learning_rate": 4.758364312267658e-05,
1946
+ "loss": 0.3308,
1947
+ "step": 2460
1948
+ },
1949
+ {
1950
+ "epoch": 1.530359355638166,
1951
+ "grad_norm": 0.445726215839386,
1952
+ "learning_rate": 4.69640644361834e-05,
1953
+ "loss": 0.171,
1954
+ "step": 2470
1955
+ },
1956
+ {
1957
+ "epoch": 1.5365551425030979,
1958
+ "grad_norm": 6.475937366485596,
1959
+ "learning_rate": 4.6344485749690216e-05,
1960
+ "loss": 0.2941,
1961
+ "step": 2480
1962
+ },
1963
+ {
1964
+ "epoch": 1.5427509293680297,
1965
+ "grad_norm": 1.953753113746643,
1966
+ "learning_rate": 4.5724907063197026e-05,
1967
+ "loss": 0.1691,
1968
+ "step": 2490
1969
+ },
1970
+ {
1971
+ "epoch": 1.5489467162329615,
1972
+ "grad_norm": 4.187342166900635,
1973
+ "learning_rate": 4.510532837670384e-05,
1974
+ "loss": 0.3448,
1975
+ "step": 2500
1976
+ },
1977
+ {
1978
+ "epoch": 1.5489467162329615,
1979
+ "eval_accuracy": 0.8754646840148699,
1980
+ "eval_loss": 0.46897682547569275,
1981
+ "eval_runtime": 107.2096,
1982
+ "eval_samples_per_second": 60.218,
1983
+ "eval_steps_per_second": 7.527,
1984
+ "step": 2500
1985
+ },
1986
+ {
1987
+ "epoch": 1.5551425030978936,
1988
+ "grad_norm": 4.653651714324951,
1989
+ "learning_rate": 4.448574969021066e-05,
1990
+ "loss": 0.2035,
1991
+ "step": 2510
1992
+ },
1993
+ {
1994
+ "epoch": 1.5613382899628254,
1995
+ "grad_norm": 1.159033179283142,
1996
+ "learning_rate": 4.3866171003717476e-05,
1997
+ "loss": 0.2354,
1998
+ "step": 2520
1999
+ },
2000
+ {
2001
+ "epoch": 1.5675340768277573,
2002
+ "grad_norm": 0.841773271560669,
2003
+ "learning_rate": 4.3246592317224286e-05,
2004
+ "loss": 0.1738,
2005
+ "step": 2530
2006
+ },
2007
+ {
2008
+ "epoch": 1.573729863692689,
2009
+ "grad_norm": 6.38914155960083,
2010
+ "learning_rate": 4.26270136307311e-05,
2011
+ "loss": 0.2743,
2012
+ "step": 2540
2013
+ },
2014
+ {
2015
+ "epoch": 1.579925650557621,
2016
+ "grad_norm": 0.2980528473854065,
2017
+ "learning_rate": 4.200743494423792e-05,
2018
+ "loss": 0.2383,
2019
+ "step": 2550
2020
+ },
2021
+ {
2022
+ "epoch": 1.5861214374225527,
2023
+ "grad_norm": 3.9257161617279053,
2024
+ "learning_rate": 4.1387856257744737e-05,
2025
+ "loss": 0.1728,
2026
+ "step": 2560
2027
+ },
2028
+ {
2029
+ "epoch": 1.5923172242874846,
2030
+ "grad_norm": 4.1586785316467285,
2031
+ "learning_rate": 4.0768277571251553e-05,
2032
+ "loss": 0.2226,
2033
+ "step": 2570
2034
+ },
2035
+ {
2036
+ "epoch": 1.5985130111524164,
2037
+ "grad_norm": 1.591169834136963,
2038
+ "learning_rate": 4.014869888475837e-05,
2039
+ "loss": 0.1998,
2040
+ "step": 2580
2041
+ },
2042
+ {
2043
+ "epoch": 1.6047087980173482,
2044
+ "grad_norm": 1.5540215969085693,
2045
+ "learning_rate": 3.952912019826518e-05,
2046
+ "loss": 0.101,
2047
+ "step": 2590
2048
+ },
2049
+ {
2050
+ "epoch": 1.61090458488228,
2051
+ "grad_norm": 1.4260759353637695,
2052
+ "learning_rate": 3.8909541511772e-05,
2053
+ "loss": 0.3129,
2054
+ "step": 2600
2055
+ },
2056
+ {
2057
+ "epoch": 1.61090458488228,
2058
+ "eval_accuracy": 0.8842936802973977,
2059
+ "eval_loss": 0.4350809156894684,
2060
+ "eval_runtime": 106.4924,
2061
+ "eval_samples_per_second": 60.624,
2062
+ "eval_steps_per_second": 7.578,
2063
+ "step": 2600
2064
+ },
2065
+ {
2066
+ "epoch": 1.6171003717472119,
2067
+ "grad_norm": 4.335544586181641,
2068
+ "learning_rate": 3.8289962825278814e-05,
2069
+ "loss": 0.2159,
2070
+ "step": 2610
2071
+ },
2072
+ {
2073
+ "epoch": 1.6232961586121437,
2074
+ "grad_norm": 1.1200919151306152,
2075
+ "learning_rate": 3.7670384138785624e-05,
2076
+ "loss": 0.2679,
2077
+ "step": 2620
2078
+ },
2079
+ {
2080
+ "epoch": 1.6294919454770755,
2081
+ "grad_norm": 2.773334264755249,
2082
+ "learning_rate": 3.705080545229244e-05,
2083
+ "loss": 0.2927,
2084
+ "step": 2630
2085
+ },
2086
+ {
2087
+ "epoch": 1.6356877323420074,
2088
+ "grad_norm": 2.635826349258423,
2089
+ "learning_rate": 3.643122676579926e-05,
2090
+ "loss": 0.2937,
2091
+ "step": 2640
2092
+ },
2093
+ {
2094
+ "epoch": 1.6418835192069392,
2095
+ "grad_norm": 2.025951385498047,
2096
+ "learning_rate": 3.5811648079306074e-05,
2097
+ "loss": 0.2392,
2098
+ "step": 2650
2099
+ },
2100
+ {
2101
+ "epoch": 1.648079306071871,
2102
+ "grad_norm": 9.39108943939209,
2103
+ "learning_rate": 3.5192069392812884e-05,
2104
+ "loss": 0.3255,
2105
+ "step": 2660
2106
+ },
2107
+ {
2108
+ "epoch": 1.6542750929368029,
2109
+ "grad_norm": 1.8526005744934082,
2110
+ "learning_rate": 3.45724907063197e-05,
2111
+ "loss": 0.1944,
2112
+ "step": 2670
2113
+ },
2114
+ {
2115
+ "epoch": 1.6604708798017347,
2116
+ "grad_norm": 0.7823792695999146,
2117
+ "learning_rate": 3.3952912019826525e-05,
2118
+ "loss": 0.149,
2119
+ "step": 2680
2120
+ },
2121
+ {
2122
+ "epoch": 1.6666666666666665,
2123
+ "grad_norm": 7.365529537200928,
2124
+ "learning_rate": 3.3333333333333335e-05,
2125
+ "loss": 0.312,
2126
+ "step": 2690
2127
+ },
2128
+ {
2129
+ "epoch": 1.6728624535315983,
2130
+ "grad_norm": 3.5820822715759277,
2131
+ "learning_rate": 3.271375464684015e-05,
2132
+ "loss": 0.1027,
2133
+ "step": 2700
2134
+ },
2135
+ {
2136
+ "epoch": 1.6728624535315983,
2137
+ "eval_accuracy": 0.8846034696406444,
2138
+ "eval_loss": 0.4310809373855591,
2139
+ "eval_runtime": 106.7243,
2140
+ "eval_samples_per_second": 60.492,
2141
+ "eval_steps_per_second": 7.562,
2142
+ "step": 2700
2143
+ },
2144
+ {
2145
+ "epoch": 1.6790582403965304,
2146
+ "grad_norm": 0.8703699111938477,
2147
+ "learning_rate": 3.209417596034697e-05,
2148
+ "loss": 0.1744,
2149
+ "step": 2710
2150
+ },
2151
+ {
2152
+ "epoch": 1.6852540272614622,
2153
+ "grad_norm": 3.368072986602783,
2154
+ "learning_rate": 3.147459727385378e-05,
2155
+ "loss": 0.2112,
2156
+ "step": 2720
2157
+ },
2158
+ {
2159
+ "epoch": 1.691449814126394,
2160
+ "grad_norm": 2.1907970905303955,
2161
+ "learning_rate": 3.0855018587360595e-05,
2162
+ "loss": 0.227,
2163
+ "step": 2730
2164
+ },
2165
+ {
2166
+ "epoch": 1.6976456009913259,
2167
+ "grad_norm": 3.725156784057617,
2168
+ "learning_rate": 3.0235439900867412e-05,
2169
+ "loss": 0.184,
2170
+ "step": 2740
2171
+ },
2172
+ {
2173
+ "epoch": 1.7038413878562577,
2174
+ "grad_norm": 0.17684808373451233,
2175
+ "learning_rate": 2.9615861214374226e-05,
2176
+ "loss": 0.1559,
2177
+ "step": 2750
2178
+ },
2179
+ {
2180
+ "epoch": 1.7100371747211895,
2181
+ "grad_norm": 5.654155731201172,
2182
+ "learning_rate": 2.8996282527881043e-05,
2183
+ "loss": 0.2551,
2184
+ "step": 2760
2185
+ },
2186
+ {
2187
+ "epoch": 1.7162329615861216,
2188
+ "grad_norm": 1.2272543907165527,
2189
+ "learning_rate": 2.837670384138786e-05,
2190
+ "loss": 0.2352,
2191
+ "step": 2770
2192
+ },
2193
+ {
2194
+ "epoch": 1.7224287484510534,
2195
+ "grad_norm": 0.7282238006591797,
2196
+ "learning_rate": 2.7757125154894676e-05,
2197
+ "loss": 0.2647,
2198
+ "step": 2780
2199
+ },
2200
+ {
2201
+ "epoch": 1.7286245353159853,
2202
+ "grad_norm": 2.949235677719116,
2203
+ "learning_rate": 2.7137546468401486e-05,
2204
+ "loss": 0.1656,
2205
+ "step": 2790
2206
+ },
2207
+ {
2208
+ "epoch": 1.734820322180917,
2209
+ "grad_norm": 5.691123962402344,
2210
+ "learning_rate": 2.6517967781908303e-05,
2211
+ "loss": 0.2086,
2212
+ "step": 2800
2213
+ },
2214
+ {
2215
+ "epoch": 1.734820322180917,
2216
+ "eval_accuracy": 0.8897149938042132,
2217
+ "eval_loss": 0.4087870121002197,
2218
+ "eval_runtime": 107.499,
2219
+ "eval_samples_per_second": 60.056,
2220
+ "eval_steps_per_second": 7.507,
2221
+ "step": 2800
2222
+ },
2223
+ {
2224
+ "epoch": 1.741016109045849,
2225
+ "grad_norm": 0.18745893239974976,
2226
+ "learning_rate": 2.589838909541512e-05,
2227
+ "loss": 0.1878,
2228
+ "step": 2810
2229
+ },
2230
+ {
2231
+ "epoch": 1.7472118959107807,
2232
+ "grad_norm": 6.888800621032715,
2233
+ "learning_rate": 2.5278810408921933e-05,
2234
+ "loss": 0.1727,
2235
+ "step": 2820
2236
+ },
2237
+ {
2238
+ "epoch": 1.7534076827757126,
2239
+ "grad_norm": 1.913245439529419,
2240
+ "learning_rate": 2.465923172242875e-05,
2241
+ "loss": 0.1732,
2242
+ "step": 2830
2243
+ },
2244
+ {
2245
+ "epoch": 1.7596034696406444,
2246
+ "grad_norm": 2.651405096054077,
2247
+ "learning_rate": 2.4039653035935564e-05,
2248
+ "loss": 0.1928,
2249
+ "step": 2840
2250
+ },
2251
+ {
2252
+ "epoch": 1.7657992565055762,
2253
+ "grad_norm": 1.3924200534820557,
2254
+ "learning_rate": 2.342007434944238e-05,
2255
+ "loss": 0.1397,
2256
+ "step": 2850
2257
+ },
2258
+ {
2259
+ "epoch": 1.771995043370508,
2260
+ "grad_norm": 7.028463840484619,
2261
+ "learning_rate": 2.2800495662949194e-05,
2262
+ "loss": 0.2461,
2263
+ "step": 2860
2264
+ },
2265
+ {
2266
+ "epoch": 1.77819083023544,
2267
+ "grad_norm": 1.74459707736969,
2268
+ "learning_rate": 2.218091697645601e-05,
2269
+ "loss": 0.137,
2270
+ "step": 2870
2271
+ },
2272
+ {
2273
+ "epoch": 1.7843866171003717,
2274
+ "grad_norm": 0.35962000489234924,
2275
+ "learning_rate": 2.1561338289962827e-05,
2276
+ "loss": 0.1767,
2277
+ "step": 2880
2278
+ },
2279
+ {
2280
+ "epoch": 1.7905824039653035,
2281
+ "grad_norm": 1.6832956075668335,
2282
+ "learning_rate": 2.094175960346964e-05,
2283
+ "loss": 0.2574,
2284
+ "step": 2890
2285
+ },
2286
+ {
2287
+ "epoch": 1.7967781908302354,
2288
+ "grad_norm": 5.7223029136657715,
2289
+ "learning_rate": 2.0322180916976458e-05,
2290
+ "loss": 0.1683,
2291
+ "step": 2900
2292
+ },
2293
+ {
2294
+ "epoch": 1.7967781908302354,
2295
+ "eval_accuracy": 0.8918835192069393,
2296
+ "eval_loss": 0.41334882378578186,
2297
+ "eval_runtime": 106.6472,
2298
+ "eval_samples_per_second": 60.536,
2299
+ "eval_steps_per_second": 7.567,
2300
+ "step": 2900
2301
+ },
2302
+ {
2303
+ "epoch": 1.8029739776951672,
2304
+ "grad_norm": 6.862188339233398,
2305
+ "learning_rate": 1.970260223048327e-05,
2306
+ "loss": 0.1923,
2307
+ "step": 2910
2308
+ },
2309
+ {
2310
+ "epoch": 1.809169764560099,
2311
+ "grad_norm": 1.456938624382019,
2312
+ "learning_rate": 1.9083023543990088e-05,
2313
+ "loss": 0.167,
2314
+ "step": 2920
2315
+ },
2316
+ {
2317
+ "epoch": 1.8153655514250309,
2318
+ "grad_norm": 1.2363471984863281,
2319
+ "learning_rate": 1.8463444857496905e-05,
2320
+ "loss": 0.1129,
2321
+ "step": 2930
2322
+ },
2323
+ {
2324
+ "epoch": 1.8215613382899627,
2325
+ "grad_norm": 2.139641761779785,
2326
+ "learning_rate": 1.7843866171003718e-05,
2327
+ "loss": 0.2046,
2328
+ "step": 2940
2329
+ },
2330
+ {
2331
+ "epoch": 1.8277571251548945,
2332
+ "grad_norm": 7.9193620681762695,
2333
+ "learning_rate": 1.7224287484510535e-05,
2334
+ "loss": 0.1843,
2335
+ "step": 2950
2336
+ },
2337
+ {
2338
+ "epoch": 1.8339529120198264,
2339
+ "grad_norm": 0.4252839684486389,
2340
+ "learning_rate": 1.660470879801735e-05,
2341
+ "loss": 0.1719,
2342
+ "step": 2960
2343
+ },
2344
+ {
2345
+ "epoch": 1.8401486988847584,
2346
+ "grad_norm": 2.5650665760040283,
2347
+ "learning_rate": 1.5985130111524162e-05,
2348
+ "loss": 0.2294,
2349
+ "step": 2970
2350
+ },
2351
+ {
2352
+ "epoch": 1.8463444857496902,
2353
+ "grad_norm": 4.639560222625732,
2354
+ "learning_rate": 1.536555142503098e-05,
2355
+ "loss": 0.163,
2356
+ "step": 2980
2357
+ },
2358
+ {
2359
+ "epoch": 1.852540272614622,
2360
+ "grad_norm": 1.6396369934082031,
2361
+ "learning_rate": 1.4745972738537794e-05,
2362
+ "loss": 0.0855,
2363
+ "step": 2990
2364
+ },
2365
+ {
2366
+ "epoch": 1.858736059479554,
2367
+ "grad_norm": 3.6492340564727783,
2368
+ "learning_rate": 1.412639405204461e-05,
2369
+ "loss": 0.2767,
2370
+ "step": 3000
2371
+ },
2372
+ {
2373
+ "epoch": 1.858736059479554,
2374
+ "eval_accuracy": 0.8963754646840149,
2375
+ "eval_loss": 0.38507798314094543,
2376
+ "eval_runtime": 106.3075,
2377
+ "eval_samples_per_second": 60.73,
2378
+ "eval_steps_per_second": 7.591,
2379
+ "step": 3000
2380
+ },
2381
+ {
2382
+ "epoch": 1.8649318463444857,
2383
+ "grad_norm": 2.8184783458709717,
2384
+ "learning_rate": 1.3506815365551426e-05,
2385
+ "loss": 0.1552,
2386
+ "step": 3010
2387
+ },
2388
+ {
2389
+ "epoch": 1.8711276332094176,
2390
+ "grad_norm": 0.4257136881351471,
2391
+ "learning_rate": 1.288723667905824e-05,
2392
+ "loss": 0.169,
2393
+ "step": 3020
2394
+ },
2395
+ {
2396
+ "epoch": 1.8773234200743496,
2397
+ "grad_norm": 6.326307773590088,
2398
+ "learning_rate": 1.2267657992565058e-05,
2399
+ "loss": 0.1842,
2400
+ "step": 3030
2401
+ },
2402
+ {
2403
+ "epoch": 1.8835192069392814,
2404
+ "grad_norm": 0.6836357712745667,
2405
+ "learning_rate": 1.1648079306071871e-05,
2406
+ "loss": 0.1587,
2407
+ "step": 3040
2408
+ },
2409
+ {
2410
+ "epoch": 1.8897149938042133,
2411
+ "grad_norm": 6.755892753601074,
2412
+ "learning_rate": 1.1028500619578686e-05,
2413
+ "loss": 0.1954,
2414
+ "step": 3050
2415
+ },
2416
+ {
2417
+ "epoch": 1.895910780669145,
2418
+ "grad_norm": 2.74873948097229,
2419
+ "learning_rate": 1.0408921933085503e-05,
2420
+ "loss": 0.1719,
2421
+ "step": 3060
2422
+ },
2423
+ {
2424
+ "epoch": 1.902106567534077,
2425
+ "grad_norm": 4.230051517486572,
2426
+ "learning_rate": 9.789343246592318e-06,
2427
+ "loss": 0.1562,
2428
+ "step": 3070
2429
+ },
2430
+ {
2431
+ "epoch": 1.9083023543990087,
2432
+ "grad_norm": 5.3604512214660645,
2433
+ "learning_rate": 9.169764560099132e-06,
2434
+ "loss": 0.2365,
2435
+ "step": 3080
2436
+ },
2437
+ {
2438
+ "epoch": 1.9144981412639406,
2439
+ "grad_norm": 0.3976893723011017,
2440
+ "learning_rate": 8.550185873605949e-06,
2441
+ "loss": 0.1322,
2442
+ "step": 3090
2443
+ },
2444
+ {
2445
+ "epoch": 1.9206939281288724,
2446
+ "grad_norm": 3.6880292892456055,
2447
+ "learning_rate": 7.930607187112764e-06,
2448
+ "loss": 0.1582,
2449
+ "step": 3100
2450
+ },
2451
+ {
2452
+ "epoch": 1.9206939281288724,
2453
+ "eval_accuracy": 0.9017967781908303,
2454
+ "eval_loss": 0.3703024089336395,
2455
+ "eval_runtime": 107.4091,
2456
+ "eval_samples_per_second": 60.107,
2457
+ "eval_steps_per_second": 7.513,
2458
+ "step": 3100
2459
+ },
2460
+ {
2461
+ "epoch": 1.9268897149938042,
2462
+ "grad_norm": 9.823491096496582,
2463
+ "learning_rate": 7.31102850061958e-06,
2464
+ "loss": 0.2671,
2465
+ "step": 3110
2466
+ },
2467
+ {
2468
+ "epoch": 1.933085501858736,
2469
+ "grad_norm": 5.724573135375977,
2470
+ "learning_rate": 6.691449814126394e-06,
2471
+ "loss": 0.2088,
2472
+ "step": 3120
2473
+ },
2474
+ {
2475
+ "epoch": 1.939281288723668,
2476
+ "grad_norm": 6.375148773193359,
2477
+ "learning_rate": 6.071871127633209e-06,
2478
+ "loss": 0.1613,
2479
+ "step": 3130
2480
+ },
2481
+ {
2482
+ "epoch": 1.9454770755885997,
2483
+ "grad_norm": 3.657437324523926,
2484
+ "learning_rate": 5.452292441140025e-06,
2485
+ "loss": 0.1824,
2486
+ "step": 3140
2487
+ },
2488
+ {
2489
+ "epoch": 1.9516728624535316,
2490
+ "grad_norm": 0.8294070959091187,
2491
+ "learning_rate": 4.832713754646841e-06,
2492
+ "loss": 0.1733,
2493
+ "step": 3150
2494
+ },
2495
+ {
2496
+ "epoch": 1.9578686493184634,
2497
+ "grad_norm": 2.991377592086792,
2498
+ "learning_rate": 4.213135068153655e-06,
2499
+ "loss": 0.2467,
2500
+ "step": 3160
2501
+ },
2502
+ {
2503
+ "epoch": 1.9640644361833952,
2504
+ "grad_norm": 3.435967445373535,
2505
+ "learning_rate": 3.5935563816604712e-06,
2506
+ "loss": 0.1882,
2507
+ "step": 3170
2508
+ },
2509
+ {
2510
+ "epoch": 1.970260223048327,
2511
+ "grad_norm": 4.247952938079834,
2512
+ "learning_rate": 2.9739776951672864e-06,
2513
+ "loss": 0.1783,
2514
+ "step": 3180
2515
+ },
2516
+ {
2517
+ "epoch": 1.9764560099132589,
2518
+ "grad_norm": 0.5351110100746155,
2519
+ "learning_rate": 2.3543990086741015e-06,
2520
+ "loss": 0.1373,
2521
+ "step": 3190
2522
+ },
2523
+ {
2524
+ "epoch": 1.9826517967781907,
2525
+ "grad_norm": 2.167306661605835,
2526
+ "learning_rate": 1.7348203221809173e-06,
2527
+ "loss": 0.1421,
2528
+ "step": 3200
2529
+ },
2530
+ {
2531
+ "epoch": 1.9826517967781907,
2532
+ "eval_accuracy": 0.90272614622057,
2533
+ "eval_loss": 0.36434125900268555,
2534
+ "eval_runtime": 107.4144,
2535
+ "eval_samples_per_second": 60.104,
2536
+ "eval_steps_per_second": 7.513,
2537
+ "step": 3200
2538
+ },
2539
+ {
2540
+ "epoch": 1.9888475836431225,
2541
+ "grad_norm": 1.093064785003662,
2542
+ "learning_rate": 1.1152416356877324e-06,
2543
+ "loss": 0.1911,
2544
+ "step": 3210
2545
+ },
2546
+ {
2547
+ "epoch": 1.9950433705080544,
2548
+ "grad_norm": 3.7282161712646484,
2549
+ "learning_rate": 4.956629491945477e-07,
2550
+ "loss": 0.2041,
2551
+ "step": 3220
2552
+ },
2553
+ {
2554
+ "epoch": 2.0,
2555
+ "step": 3228,
2556
+ "total_flos": 4.004423768814723e+18,
2557
+ "train_loss": 0.9726454161726114,
2558
+ "train_runtime": 5145.5507,
2559
+ "train_samples_per_second": 10.036,
2560
+ "train_steps_per_second": 0.627
2561
+ }
2562
+ ],
2563
+ "logging_steps": 10,
2564
+ "max_steps": 3228,
2565
+ "num_input_tokens_seen": 0,
2566
+ "num_train_epochs": 2,
2567
+ "save_steps": 100,
2568
+ "stateful_callbacks": {
2569
+ "TrainerControl": {
2570
+ "args": {
2571
+ "should_epoch_stop": false,
2572
+ "should_evaluate": false,
2573
+ "should_log": false,
2574
+ "should_save": true,
2575
+ "should_training_stop": true
2576
+ },
2577
+ "attributes": {}
2578
+ }
2579
+ },
2580
+ "total_flos": 4.004423768814723e+18,
2581
+ "train_batch_size": 16,
2582
+ "trial_name": null,
2583
+ "trial_params": null
2584
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8146605fe6779725eb5393f6c0d83f33144126bccad7a64e6d9e3b6963548e80
3
+ size 5176